Skip to content

Commit

Permalink
Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Browse files Browse the repository at this point in the history
Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

1) Remove indirection and use nf_ct_get() instead from nfnetlink_log
   and nfnetlink_queue, from Florian Westphal.

2) Add weighted random twos choice least-connection scheduling for IPVS,
   from Darby Payne.

3) Add a __hash placeholder in the flow tuple structure to identify
   the field to be included in the rhashtable key hash calculation.

4) Add a new nft_parse_register_load() and nft_parse_register_store()
   to consolidate register load and store in the core.

5) Statify nft_parse_register() since it has no more module clients.

6) Remove redundant assignment in nft_cmp, from Colin Ian King.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next:
  netfilter: nftables: remove redundant assignment of variable err
  netfilter: nftables: statify nft_parse_register()
  netfilter: nftables: add nft_parse_register_store() and use it
  netfilter: nftables: add nft_parse_register_load() and use it
  netfilter: flowtable: add hash offset field to tuple
  ipvs: add weighted random twos choice algorithm
  netfilter: ctnetlink: remove get_ct indirection
====================

Link: https://lore.kernel.org/r/20210206015005.23037-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
kuba-moo committed Feb 6, 2021
2 parents 7274c41 + 626899a commit c273a20
Show file tree
Hide file tree
Showing 44 changed files with 406 additions and 247 deletions.
2 changes: 0 additions & 2 deletions include/linux/netfilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -463,8 +463,6 @@ extern struct nf_ct_hook __rcu *nf_ct_hook;
struct nlattr;

struct nfnl_ct_hook {
struct nf_conn *(*get_ct)(const struct sk_buff *skb,
enum ip_conntrack_info *ctinfo);
size_t (*build_size)(const struct nf_conn *ct);
int (*build)(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
Expand Down
4 changes: 4 additions & 0 deletions include/net/netfilter/nf_flow_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ struct flow_offload_tuple {

u8 l3proto;
u8 l4proto;

/* All members above are keys for lookups, see flow_offload_hash(). */
struct { } __hash;

u8 dir;

u16 mtu;
Expand Down
11 changes: 5 additions & 6 deletions include/net/netfilter/nf_tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,13 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
}

int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest);
unsigned int nft_parse_register(const struct nlattr *attr);
int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);

int nft_validate_register_load(enum nft_registers reg, unsigned int len);
int nft_validate_register_store(const struct nft_ctx *ctx,
enum nft_registers reg,
const struct nft_data *data,
enum nft_data_types type, unsigned int len);
int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len);
int nft_parse_register_store(const struct nft_ctx *ctx,
const struct nlattr *attr, u8 *dreg,
const struct nft_data *data,
enum nft_data_types type, unsigned int len);

/**
* struct nft_userdata - user defined data associated with an object
Expand Down
12 changes: 6 additions & 6 deletions include/net/netfilter/nf_tables_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,21 @@ void nf_tables_core_module_exit(void);
struct nft_bitwise_fast_expr {
u32 mask;
u32 xor;
enum nft_registers sreg:8;
enum nft_registers dreg:8;
u8 sreg;
u8 dreg;
};

struct nft_cmp_fast_expr {
u32 data;
u32 mask;
enum nft_registers sreg:8;
u8 sreg;
u8 len;
bool inv;
};

struct nft_immediate_expr {
struct nft_data data;
enum nft_registers dreg:8;
u8 dreg;
u8 dlen;
};

Expand All @@ -60,14 +60,14 @@ struct nft_payload {
enum nft_payload_bases base:8;
u8 offset;
u8 len;
enum nft_registers dreg:8;
u8 dreg;
};

struct nft_payload_set {
enum nft_payload_bases base:8;
u8 offset;
u8 len;
enum nft_registers sreg:8;
u8 sreg;
u8 csum_type;
u8 csum_offset;
u8 csum_flags;
Expand Down
2 changes: 1 addition & 1 deletion include/net/netfilter/nft_fib.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <net/netfilter/nf_tables.h>

struct nft_fib {
enum nft_registers dreg:8;
u8 dreg;
u8 result;
u32 flags;
};
Expand Down
4 changes: 2 additions & 2 deletions include/net/netfilter/nft_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
struct nft_meta {
enum nft_meta_keys key:8;
union {
enum nft_registers dreg:8;
enum nft_registers sreg:8;
u8 dreg;
u8 sreg;
};
};

Expand Down
5 changes: 2 additions & 3 deletions net/bridge/netfilter/nft_meta_bridge.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,8 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
return nft_meta_get_init(ctx, expr, tb);
}

priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
return nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, len);
return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, len);
}

static struct nft_expr_type nft_meta_bridge_type;
Expand Down
18 changes: 9 additions & 9 deletions net/ipv4/netfilter/nft_dup_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
#include <net/netfilter/ipv4/nf_dup_ipv4.h>

struct nft_dup_ipv4 {
enum nft_registers sreg_addr:8;
enum nft_registers sreg_dev:8;
u8 sreg_addr;
u8 sreg_dev;
};

static void nft_dup_ipv4_eval(const struct nft_expr *expr,
Expand All @@ -40,16 +40,16 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;

priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
sizeof(struct in_addr));
if (err < 0)
return err;

if (tb[NFTA_DUP_SREG_DEV] != NULL) {
priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
return nft_validate_register_load(priv->sreg_dev, sizeof(int));
}
return 0;
if (tb[NFTA_DUP_SREG_DEV])
err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV],
&priv->sreg_dev, sizeof(int));

return err;
}

static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
Expand Down
18 changes: 9 additions & 9 deletions net/ipv6/netfilter/nft_dup_ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
#include <net/netfilter/ipv6/nf_dup_ipv6.h>

struct nft_dup_ipv6 {
enum nft_registers sreg_addr:8;
enum nft_registers sreg_dev:8;
u8 sreg_addr;
u8 sreg_dev;
};

static void nft_dup_ipv6_eval(const struct nft_expr *expr,
Expand All @@ -38,16 +38,16 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;

priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr));
err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
sizeof(struct in6_addr));
if (err < 0)
return err;

if (tb[NFTA_DUP_SREG_DEV] != NULL) {
priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
return nft_validate_register_load(priv->sreg_dev, sizeof(int));
}
return 0;
if (tb[NFTA_DUP_SREG_DEV])
err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV],
&priv->sreg_dev, sizeof(int));

return err;
}

static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
Expand Down
11 changes: 11 additions & 0 deletions net/netfilter/ipvs/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,17 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.

config IP_VS_TWOS
tristate "weighted random twos choice least-connection scheduling"
help
The weighted random twos choice least-connection scheduling
algorithm picks two random real servers and directs network
connections to the server with the least active connections
normalized by the server weight.

If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.

comment 'IPVS SH scheduler'

config IP_VS_SH_TAB_BITS
Expand Down
1 change: 1 addition & 0 deletions net/netfilter/ipvs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o

# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
Expand Down
139 changes: 139 additions & 0 deletions net/netfilter/ipvs/ip_vs_twos.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* IPVS: Power of Twos Choice Scheduling module
*
* Authors: Darby Payne <darby.payne@applovin.com>
*/

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/random.h>

#include <net/ip_vs.h>

/* Power of Twos Choice scheduling, algorithm originally described by
* Michael Mitzenmacher.
*
* Randomly picks two destinations and picks the one with the least
* amount of connections
*
* The algorithm calculates a few variables
* - total_weight = sum of all weights
* - rweight1 = random number between [0,total_weight]
* - rweight2 = random number between [0,total_weight]
*
* For each destination
* decrement rweight1 and rweight2 by the destination weight
* pick choice1 when rweight1 is <= 0
* pick choice2 when rweight2 is <= 0
*
* Return choice2 if choice2 has fewer connections than choice1, normalized
* by weight
*
* References
* ----------
*
* [Mitzenmacher 2016]
* The Power of Two Random Choices: A Survey of Techniques and Results
* Michael Mitzenmacher, Andrea W. Richa, Ramesh Sitaraman
* http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
*
*/
/* Pick a destination for a new connection using the weighted "power of
 * two choices" rule: draw two weighted-random destinations and keep the
 * one whose connection overhead is lower relative to its weight.
 *
 * @svc: service whose destination list is walked
 * @skb: packet being scheduled (not used by this scheduler)
 * @iph: parsed IP header (not used by this scheduler)
 *
 * Returns the chosen destination, or NULL (after reporting through
 * ip_vs_scheduler_err()) when no non-overloaded destination has a
 * positive weight.
 */
static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc,
					      const struct sk_buff *skb,
					      struct ip_vs_iphdr *iph)
{
	struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL;
	int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0;
	int overhead2 = 0, total_weight = 0, weight;

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* Sum the weights of all usable destinations. choice1 ends up
	 * pointing at the last usable one, which doubles as the fallback
	 * result should the selection pass below leave it unchanged.
	 */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) {
			weight = atomic_read(&dest->weight);
			if (weight > 0) {
				total_weight += weight;
				choice1 = dest;
			}
		}
	}

	if (!choice1) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	/* Add 1 to total_weight so that the random weights are inclusive
	 * from 0 to total_weight
	 */
	total_weight += 1;
	rweight1 = prandom_u32() % total_weight;
	rweight2 = prandom_u32() % total_weight;

	/* Pick two weighted servers: each random weight is consumed by the
	 * destinations in list order, and the destination that drives it
	 * to zero or below becomes the corresponding choice.
	 */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		weight = atomic_read(&dest->weight);
		if (weight <= 0)
			continue;

		rweight1 -= weight;
		rweight2 -= weight;

		/* weightN == -1 means "choice N not made yet" */
		if (rweight1 <= 0 && weight1 == -1) {
			choice1 = dest;
			weight1 = weight;
			overhead1 = ip_vs_dest_conn_overhead(dest);
		}

		if (rweight2 <= 0 && weight2 == -1) {
			choice2 = dest;
			weight2 = weight;
			overhead2 = ip_vs_dest_conn_overhead(dest);
		}

		if (weight1 != -1 && weight2 != -1)
			break;
	}

	/* Compare overhead1/weight1 against overhead2/weight2 by cross
	 * multiplication. The operands are widened to 64 bits first: the
	 * product of a weight and a connection overhead does not
	 * necessarily fit in an int, and a wrapped product would select
	 * the wrong destination.
	 */
	if (choice2 &&
	    (long long)weight2 * overhead1 > (long long)weight1 * overhead2)
		choice1 = choice2;

	IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n",
		      IP_VS_DBG_ADDR(choice1->af, &choice1->addr),
		      ntohs(choice1->port), atomic_read(&choice1->activeconns),
		      refcount_read(&choice1->refcnt),
		      atomic_read(&choice1->weight));

	return choice1;
}

/* Scheduler descriptor for the "twos" algorithm; ip_vs_twos_schedule()
 * is installed as its schedule callback.
 */
static struct ip_vs_scheduler ip_vs_twos_scheduler = {
	.schedule = ip_vs_twos_schedule,
	.name = "twos",
	.module = THIS_MODULE,
	.refcnt = ATOMIC_INIT(0),
	.n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list),
};

/* Module init: register the twos scheduler and hand back whatever
 * register_ip_vs_scheduler() reports.
 */
static int __init ip_vs_twos_init(void)
{
	int ret = register_ip_vs_scheduler(&ip_vs_twos_scheduler);

	return ret;
}

/* Module exit: unregister the twos scheduler, then wait for an RCU grace
 * period before returning.
 */
static void __exit ip_vs_twos_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_twos_scheduler);
/* NOTE(review): presumably lets readers still inside
 * ip_vs_twos_schedule() finish before the module is unloaded - confirm.
 */
synchronize_rcu();
}

/* Module entry and exit hooks, and the module license declaration. */
module_init(ip_vs_twos_init);
module_exit(ip_vs_twos_cleanup);
MODULE_LICENSE("GPL");
7 changes: 0 additions & 7 deletions net/netfilter/nf_conntrack_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -2686,12 +2686,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
;
}

static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
enum ip_conntrack_info *ctinfo)
{
return nf_ct_get(skb, ctinfo);
}

static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
const struct nf_conntrack_zone *zone;
Expand Down Expand Up @@ -2925,7 +2919,6 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
}

static struct nfnl_ct_hook ctnetlink_glue_hook = {
.get_ct = ctnetlink_glue_get_ct,
.build_size = ctnetlink_glue_build_size,
.build = ctnetlink_glue_build,
.parse = ctnetlink_glue_parse,
Expand Down
6 changes: 3 additions & 3 deletions net/netfilter/nf_flow_table_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple *tuple = data;

return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple_rhash *tuplehash = data;

return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
Expand All @@ -207,7 +207,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
const struct flow_offload_tuple *tuple = arg->key;
const struct flow_offload_tuple_rhash *x = ptr;

if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
return 1;

return 0;
Expand Down
Loading

0 comments on commit c273a20

Please sign in to comment.