/* Connections' size value needed by ip_vs_ctl.c */
extern int ip_vs_conn_tab_size;
/* Serializes IPVS configuration changes (defined in ip_vs_ctl.c) */
extern struct mutex __ip_vs_mutex;
/* Parsed IP-header information, filled for both IPv4 and IPv6 packets */
struct ip_vs_iphdr {
	int hdr_flags;		/* ipvs flags */
	__u32 off;		/* Where IP or IPv4 header starts */
	__u32 len;		/* IPv4: simply where L4 starts;
				 * IPv6: where the L4 transport header starts */
	__u16 fragoffs;		/* IPv6 fragment offset, 0 if first frag (or not frag) */
	__s16 protocol;
	__s32 flags;
	union nf_inet_addr saddr;
	union nf_inet_addr daddr;
};
/*
 * Fragment-safe header access: thin wrapper around skb_header_pointer(),
 * copying @len bytes at @offset from @skb into @buffer when the data is
 * not linearly accessible.  Returns a pointer to the data or NULL.
 */
static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
				     int len, void *buffer)
{
	return skb_header_pointer(skb, offset, len, buffer);
}
/* This function handles filling *ip_vs_iphdr, both for IPv4 and IPv6. * IPv6 requires some extra work, as finding proper header position, * depend on the IPv6 extension headers.
*/ staticinlineint
ip_vs_fill_iph_skb_off(int af, conststruct sk_buff *skb, int offset, int hdr_flags, struct ip_vs_iphdr *iphdr)
{
iphdr->hdr_flags = hdr_flags;
iphdr->off = offset;
#define IP_VS_DBG_BUF(level, msg, ...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \ if (level <= ip_vs_get_debug_level()) \
printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \
} while (0) #define IP_VS_ERR_BUF(msg...) \ do { \ char ip_vs_dbg_buf[160]; \ int ip_vs_dbg_idx = 0; \
pr_err(msg); \
} while (0)
/* Only use from within IP_VS_DBG_BUF() or IP_VS_ERR_BUF macros */ #define IP_VS_DBG_ADDR(af, addr) \
ip_vs_dbg_addr(af, ip_vs_dbg_buf, \ sizeof(ip_vs_dbg_buf), addr, \
&ip_vs_dbg_idx)
#define IP_VS_DBG(level, msg, ...) \ do { \ if (level <= ip_vs_get_debug_level()) \
printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \
} while (0) #define IP_VS_DBG_RL(msg, ...) \ do { \ if (net_ratelimit()) \
printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \
} while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level()) \
pp->debug_packet(af, pp, skb, ofs, msg); \
} while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) \ do { \ if (level <= ip_vs_get_debug_level() && \
net_ratelimit()) \
pp->debug_packet(af, pp, skb, ofs, msg); \
} while (0) #else/* NO DEBUGGING at ALL */ #define IP_VS_DBG_BUF(level, msg...) do {} while (0) #define IP_VS_ERR_BUF(msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0) #define IP_VS_DBG_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #define IP_VS_DBG_RL_PKT(level, af, pp, skb, ofs, msg) do {} while (0) #endif
#define IP_VS_BUG() BUG() #define IP_VS_ERR_RL(msg, ...) \ do { \ if (net_ratelimit()) \
pr_err(msg, ##__VA_ARGS__); \
} while (0)
/* The port number of FTP service (in network order). */
#define FTPPORT	cpu_to_be16(21)
#define FTPDATA	cpu_to_be16(20)

/* Connection templates use bits from state */
#define IP_VS_CTPL_S_NONE	0x0000
#define IP_VS_CTPL_S_ASSURED	0x0001
#define IP_VS_CTPL_S_LAST	0x0002
/*
 * TCP sequence-number delta bookkeeping, used only in VS/NAT.
 * Every ip_vs_conn keeps two of these: one for the output and one for
 * the input direction.
 */
struct ip_vs_seq {
	__u32 init_seq;		/* apply delta starting from this seq */
	__u32 delta;		/* current delta in sequence numbers */
	__u32 previous_delta;	/* delta in sequence numbers before the
				 * last resized packet */
};
u64 cps; /* current connection rate */
u64 inpps; /* current in packet rate */
u64 outpps; /* current out packet rate */
u64 inbps; /* current in byte rate */
u64 outbps; /* current out byte rate */
};
/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */
#define IPVS_EST_NTICKS		50
/* Estimation uses a 2-second period containing ticks (in jiffies) */
#define IPVS_EST_TICK		((2 * HZ) / IPVS_EST_NTICKS)

/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C).
 * Value of 4 and above ensures kthreads will take work without exceeding
 * the CPU capacity under different circumstances.
 */
#define IPVS_EST_LOAD_DIVISOR	8

/* Kthreads should not have work that exceeds the CPU load above 50% */
#define IPVS_EST_CPU_KTHREADS	(IPVS_EST_LOAD_DIVISOR / 2)

/* Desired number of chains per timer tick (chain load factor in 100us units),
 * 48=4.8ms of 40ms tick (12% CPU usage):
 * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50
 */
#define IPVS_EST_CHAIN_FACTOR	\
	ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8)

/* Compiled number of chains per tick
 * The defines should match cond_resched_rcu
 */
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
#define IPVS_EST_TICK_CHAINS	IPVS_EST_CHAIN_FACTOR
#else
#define IPVS_EST_TICK_CHAINS	1
#endif

#if IPVS_EST_NTICKS > 127
#error Too many timer ticks for ktrow
#endif
/* Multiple estimator chains processed in the same timer tick */
struct ip_vs_est_tick_data {
	struct rcu_head rcu_head;
	struct hlist_head chains[IPVS_EST_TICK_CHAINS];
	/* per-chain occupancy bitmaps and entry counters */
	DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS);
	DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS);
	int chain_len[IPVS_EST_TICK_CHAINS];
};
/* Context for estimation kthread */
struct ip_vs_est_kt_data {
	struct netns_ipvs *ipvs;
	struct task_struct *task;		/* task if running */
	struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS];
	DECLARE_BITMAP(avail, IPVS_EST_NTICKS);	/* tick has space for ests */
	unsigned long est_timer;		/* estimation timer (jiffies) */
	struct ip_vs_stats *calc_stats;		/* Used for calculation */
	int tick_len[IPVS_EST_NTICKS];		/* est count */
	int id;					/* ktid per netns */
	int chain_max;				/* max ests per tick chain */
	int tick_max;				/* max ests per tick */
	int est_count;				/* attached ests to kthread */
	int est_max_count;			/* max ests per kthread */
	int add_row;				/* row for new ests */
	int est_row;				/* estimated row */
};
int (*conn_schedule)(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph);
struct ip_vs_conn *
(*conn_in_get)(struct netns_ipvs *ipvs, int af, conststruct sk_buff *skb, conststruct ip_vs_iphdr *iph);
struct ip_vs_conn *
(*conn_out_get)(struct netns_ipvs *ipvs, int af, conststruct sk_buff *skb, conststruct ip_vs_iphdr *iph);
/* Flags and state transition */
spinlock_t lock; /* lock for state transition */ volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization
*/
__u32 fwmark; /* Fire wall mark from skb */ unsignedlong sync_endtime; /* jiffies + sent_retries */
/* Control members */ struct ip_vs_conn *control; /* Master control connection */
atomic_t n_control; /* Number of controlled ones */ struct ip_vs_dest *dest; /* real server */
atomic_t in_pkts; /* incoming packet counter */
/* Packet transmitter for different forwarding methods. If it * mangles the packet, it must return NF_DROP or better NF_STOLEN, * otherwise this must be changed to a sk_buff **. * NF_ACCEPT can be returned when destination is local.
*/ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
/* Note: we can group the following members into a structure, * in order to save more space, and the following members are * only used in VS/NAT anyway
*/ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */
struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */
);
/* Extended internal versions of struct ip_vs_service_user and ip_vs_dest_user
 * for IPv6 support.
 *
 * We need these to conveniently pass around service and destination
 * options, but unfortunately, we also need to keep the old definitions to
 * maintain userspace backwards compatibility for the setsockopt interface.
 */
struct ip_vs_service_user_kern {
	/* virtual service addresses */
	u16 af;
	u16 protocol;
	union nf_inet_addr addr;	/* virtual ip address */
	__be16 port;
	u32 fwmark;			/* firewall mark of service */

	/* virtual service options */
	char *sched_name;
	char *pe_name;
	unsigned int flags;		/* virtual service flags */
	unsigned int timeout;		/* persistent timeout in sec */
	__be32 netmask;			/* persistent netmask or plen */
};
struct ip_vs_dest_user_kern {
	/* destination server address */
	union nf_inet_addr addr;
	__be16 port;

	/* real server options */
	unsigned int conn_flags;	/* connection flags */
	int weight;			/* destination weight */

	/* thresholds for active connections */
	u32 u_threshold;		/* upper threshold */
	u32 l_threshold;		/* lower threshold */

	/* Address family of addr */
	u16 af;

	u16 tun_type;			/* tunnel type */
	__be16 tun_port;		/* tunnel port */
	u16 tun_flags;			/* tunnel flags */
};
/* * The information about the virtual service offered to the net and the * forwarding entries.
*/ struct ip_vs_service { struct hlist_node s_list; /* for normal service table */ struct hlist_node f_list; /* for fwmark-based service table */
atomic_t refcnt; /* reference counter */
u16 af; /* address family */
__u16 protocol; /* which protocol (TCP/UDP) */ union nf_inet_addr addr; /* IP address for virtual service */
__be16 port; /* port number for the service */
__u32 fwmark; /* firewall mark of the service */ unsignedint flags; /* service status flags */ unsignedint timeout; /* persistent timeout in ticks */
__be32 netmask; /* grouping granularity, mask/plen */ struct netns_ipvs *ipvs;
struct list_head destinations; /* real server d-linked list */
__u32 num_dests; /* number of servers */ struct ip_vs_stats stats; /* statistics for the service */
/* Information for cached dst */
struct ip_vs_dest_dst {
	struct dst_entry *dst_cache;	/* destination cache entry */
	u32 dst_cookie;
	union nf_inet_addr dst_saddr;
	struct rcu_head rcu_head;
};
/* The real server destination forwarding entry with ip address, port number,
 * and so on.
 */
struct ip_vs_dest {
	struct list_head n_list;	/* for the dests in the service */
	struct hlist_node d_list;	/* for table with all the dests */

	u16 af;				/* address family */
	__be16 port;			/* port number of the server */
	union nf_inet_addr addr;	/* IP address of the server */
	volatile unsigned int flags;	/* dest status flags */
	atomic_t conn_flags;		/* flags to copy to conn */
	atomic_t weight;		/* server weight */
	atomic_t last_weight;		/* server latest weight */
	__u16 tun_type;			/* tunnel type */
	__be16 tun_port;		/* tunnel port */
	__u16 tun_flags;		/* tunnel flags */

	/* for destination cache */
	spinlock_t dst_lock;		/* lock of dst_cache */
	struct ip_vs_dest_dst __rcu *dest_dst;	/* cached dst info */

	/* for virtual service */
	struct ip_vs_service __rcu *svc;	/* service it belongs to */
	__u16 protocol;			/* which protocol (TCP/UDP) */
	__be16 vport;			/* virtual port number */
	union nf_inet_addr vaddr;	/* virtual IP address */
	__u32 vfwmark;			/* firewall mark of service */

	struct rcu_head rcu_head;
	struct list_head t_list;	/* in dest_trash */
	unsigned int in_rs_table:1;	/* we are in rs_table */
};
/* The scheduler object */
struct ip_vs_scheduler {
	struct list_head n_list;	/* d-linked list head */
	char *name;			/* scheduler name */
	atomic_t refcnt;		/* reference counter */
	struct module *module;		/* THIS_MODULE/NULL */

	/* scheduler initializing service */
	int (*init_service)(struct ip_vs_service *svc);
	/* scheduling service finish */
	void (*done_service)(struct ip_vs_service *svc);
	/* dest is linked */
	int (*add_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);
	/* dest is unlinked */
	int (*del_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);
	/* dest is updated */
	int (*upd_dest)(struct ip_vs_service *svc, struct ip_vs_dest *dest);

	/* selecting a server from the given service */
	struct ip_vs_dest *(*schedule)(struct ip_vs_service *svc,
				       const struct sk_buff *skb,
				       struct ip_vs_iphdr *iph);
};
/* The persistence engine object */
struct ip_vs_pe {
	struct list_head n_list;	/* d-linked list head */
	char *name;			/* scheduler name */
	atomic_t refcnt;		/* reference counter */
	struct module *module;		/* THIS_MODULE/NULL */

	/* get the connection template, if any */
	int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
	bool (*ct_match)(const struct ip_vs_conn_param *p,
			 struct ip_vs_conn *ct);
	u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
			   bool inverse);
	int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
	/* create connections for real-server outgoing packets */
	struct ip_vs_conn *(*conn_out)(struct ip_vs_service *svc,
				       struct ip_vs_dest *dest,
				       struct sk_buff *skb,
				       const struct ip_vs_iphdr *iph,
				       __be16 dport, __be16 cport);
};
/* The application module object (a.k.a. app incarnation) */ struct ip_vs_app { struct list_head a_list; /* member in app list */ int type; /* IP_VS_APP_TYPE_xxx */ char *name; /* application module name */
__u16 protocol; struct module *module; /* THIS_MODULE/NULL */ struct list_head incs_list; /* list of incarnations */
/* members for application incarnations */ struct list_head p_list; /* member in proto app list */ struct ip_vs_app *app; /* its real application */
__be16 port; /* port number in net order */
atomic_t usecnt; /* usage counter */ struct rcu_head rcu_head;
/* output hook: Process packet in inout direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated
*/ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
/* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated
*/ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
#ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ struct delayed_work expire_nodest_conn_work; /* 1/rate drop and drop-entry variables */ struct delayed_work defense_work; /* Work handler */ int drop_rate; int drop_counter; int old_secure_tcp;
atomic_t dropentry; /* locks in ctl.c */
spinlock_t dropentry_lock; /* drop entry handling */
spinlock_t droppacket_lock; /* drop packet handling */
spinlock_t securetcp_lock; /* state and timeout tables */
/* sysctl variables */ int sysctl_amemthresh; int sysctl_am_droprate; int sysctl_drop_entry; int sysctl_drop_packet; int sysctl_secure_tcp; #ifdef CONFIG_IP_VS_NFCT int sysctl_conntrack; #endif int sysctl_snat_reroute; int sysctl_sync_ver; int sysctl_sync_ports; int sysctl_sync_persist_mode; unsignedlong sysctl_sync_qlen_max; int sysctl_sync_sock_size; int sysctl_cache_bypass; int sysctl_expire_nodest_conn; int sysctl_sloppy_tcp; int sysctl_sloppy_sctp; int sysctl_expire_quiescent_template; int sysctl_sync_threshold[2]; unsignedint sysctl_sync_refresh_period; int sysctl_sync_retries; int sysctl_nat_icmp_send; int sysctl_pmtu_disc; int sysctl_backup_only; int sysctl_conn_reuse_mode; int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; #ifdef CONFIG_SYSCTL
cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ int est_cpulist_valid; /* cpulist set */ int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif
/* ip_vs_lblc */ int sysctl_lblc_expiration; struct ctl_table_header *lblc_ctl_header; struct ctl_table *lblc_ctl_table; /* ip_vs_lblcr */ int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ struct hlist_head est_temp_list; /* Ests during calc phase */ struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ unsignedlong est_max_threads;/* Hard limit of kthreads */ int est_calc_phase; /* Calculation phase */ int est_chain_max; /* Calculated chain_max */ int est_kt_count; /* Allocated ptrs */ int est_add_ktid; /* ktid where to add ests */
atomic_t est_genid; /* kthreads reload genid */
atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */
spinlock_t sync_lock; struct ipvs_master_sync_state *ms;
spinlock_t sync_buff_lock; struct ip_vs_sync_thread_data *master_tinfo; struct ip_vs_sync_thread_data *backup_tinfo; int threads_mask; volatileint sync_state; struct mutex sync_mutex; struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed because heterogeneous * are not supported when synchronization is enabled.
*/ unsignedint mixed_address_family_dests; unsignedint hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */
};
struct ip_vs_conn *ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
					    const struct sk_buff *skb,
					    const struct ip_vs_iphdr *iph);
/* Get reference to gain full access to conn. * By default, RCU read-side critical sections have access only to * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference.
*/ staticinlinebool __ip_vs_conn_get(struct ip_vs_conn *cp)
{ return refcount_inc_not_zero(&cp->refcnt);
}
/* Put back the conn without restarting its timer */
static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* order prior memory accesses before the reference drop */
	smp_mb__before_atomic();
	refcount_dec(&cp->refcnt);
}

void ip_vs_conn_put(struct ip_vs_conn *cp);
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
cp->control = NULL; if (atomic_read(&ctl_cp->n_control) == 0) {
IP_VS_ERR_BUF("BUG control DEL with n=0 : " "%s:%d to %s:%d\n",
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
ntohs(cp->vport));
return;
}
atomic_dec(&ctl_cp->n_control);
}
staticinlinevoid
ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp)
{ if (cp->control) {
IP_VS_ERR_BUF("request control ADD for already controlled: " "%s:%d to %s:%d\n",
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
ntohs(cp->vport));
/* Drop one dest reference; free the dest when the last reference is gone */
static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest)
{
	if (refcount_dec_and_test(&dest->refcnt))
		kfree(dest);
}
/* IPVS sync daemon data and function prototypes
 * (from ip_vs_sync.c)
 */
int start_sync_thread(struct netns_ipvs *ipvs,
		      struct ipvs_sync_daemon_cfg *cfg, int state);
int stop_sync_thread(struct netns_ipvs *ipvs, int state);
void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts);
/* Recompute whether estimator kthreads must be stopped for this netns */
static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs)
{
#ifdef CONFIG_SYSCTL
	/* Stop tasks while cpulist is empty or if disabled with flag */
	ipvs->est_stopped = !sysctl_run_estimation(ipvs) ||
			    (ipvs->est_cpulist_valid &&
			     cpumask_empty(sysctl_est_cpulist(ipvs)));
#endif
}
#ifdef CONFIG_SYSCTL
/* This is a simple mechanism to ignore packets when
 * we are loaded. Just set ip_vs_drop_rate to 'n' and
 * we start to drop 1/rate of the packets
 */
static inline int ip_vs_todrop(struct netns_ipvs *ipvs)
{
	if (ipvs->drop_rate == 0)
		return 0;
	if (--ipvs->drop_counter > 0)
		return 0;
	/* counter expired: reload it and signal one drop */
	ipvs->drop_counter = ipvs->drop_rate;
	return 1;
}
#else
static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; }
#endif
#ifdef CONFIG_SYSCTL
/* Enqueue delayed work for expiring no dest connections
 * Only run when sysctl_expire_nodest=1
 */
static inline void ip_vs_enqueue_expire_nodest_conns(struct netns_ipvs *ipvs)
{
	if (sysctl_expire_nodest_conn(ipvs))
		queue_delayed_work(system_long_wq,
				   &ipvs->expire_nodest_conn_work, 1);
}
/* Using old conntrack that can not be redirected to another real server? */ staticinlinebool ip_vs_conn_uses_old_conntrack(struct ip_vs_conn *cp, struct sk_buff *skb)
{ #ifdef CONFIG_IP_VS_NFCT enum ip_conntrack_info ctinfo; struct nf_conn *ct;
/* Estimate the connection-processing overhead of a real server.
 * We think the overhead of processing active connections is 256
 * times higher than that of inactive connections in average. (This
 * 256 times might not be accurate, we will change it later.)  So:
 *	overhead = activeconns * 256 + inactconns
 */
static inline int
ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
{
	int active = atomic_read(&dest->activeconns);
	int inactive = atomic_read(&dest->inactconns);

	return (active << 8) + inactive;
}
/*
 * (Website boilerplate carried over from the page this file was extracted
 * from; translated from German:) The information on this web page was
 * compiled carefully and to the best of our knowledge.  However, neither
 * completeness, nor correctness, nor quality of the provided information
 * is guaranteed.
 * Note: the colored syntax rendering and the measurement are still
 * experimental.
 */