diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 29267da5fe9f0..e774ee89b5341 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -199,6 +199,15 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) struct tcp_md5sig_key; +/*phuc*/ +typedef enum { + RIGHT_RATIO_SET, + LEFT_RATIO_SET, + SEARCH_RATE, + RIGHT_RATIO_FINE, + LEFT_RATIO_FINE +}ratio_search_state; + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -273,6 +282,9 @@ struct tcp_sock { u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ + /*Phuc*/ + u8 reord; /* reordering detected */ + /****/ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ @@ -494,6 +506,31 @@ struct tcp_sock { char mptcp_sched_name[MPTCP_SCHED_NAME_MAX]; char mptcp_pm_name[MPTCP_PM_NAME_MAX]; #endif /* CONFIG_MPTCP */ +/* shivanga: Ratio Scheduler Variables */ + u32 last_probe_tstamp; + u32 in_probe; + u32 last_ac_rate; + u32 last_ratio; + u64 prev_tx_bytes; + u64 prev_tstamp; + u32 rate_est_val; + u32 rate_est_cnt; + u32 last_rate_search_start[5]; /* 5 because that's the search trigger threshold */ + u32 init_buffer_size[2]; + u32 last_buffer_size[2]; + u8 buffer_threshold_cnt; + s32 buffer_trigger_threshold; + ratio_search_state search_state; + //u8 buf_size_acc; + struct sock *prev_sk; + unsigned int num_segments_flow_one; + int ratio_search_step; + unsigned int ratio_rate_sample; + bool run_started; + bool init_search; +/*Phuc: Ratio scheduler optimization*/ + u16 head_length; + u32 num_acks_head[2]; }; enum tsq_enum { @@ -536,7 +573,7 @@ struct tcp_timewait_sock { /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; - int tw_ts_recent_stamp; + long tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 2d0134da998aa..145989c734c41 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -157,6 +157,7 @@ struct mptcp_options_received { struct mptcp_tcp_sock { struct hlist_node node; + struct tcp_sock *next; /* Next subflow socket */ struct hlist_node cb_list; struct mptcp_options_received rx_opt; @@ -166,6 +167,7 @@ struct mptcp_tcp_sock { u16 map_data_len; u16 slave_sk:1, fully_established:1, + establish_increased:1, second_packet:1, attached:1, send_mp_fail:1, @@ -187,7 +189,7 @@ struct mptcp_tcp_sock { u8 rem_id; u8 sk_err; -#define MPTCP_SCHED_SIZE 16 +#define MPTCP_SCHED_SIZE 64 u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8); int init_rcv_wnd; @@ -261,6 +263,8 @@ struct mptcp_sched_ops { struct mptcp_cb { /* list of sockets in this multipath connection */ struct hlist_head conn_list; + /*phuc*/ + struct tcp_sock *connection_list; /* list of sockets that need a call to release_cb */ struct hlist_head callback_list; @@ -285,9 +289,13 @@ struct mptcp_cb { rcv_hiseq_index:1, /* Index in rcv_high_order of rcv_nxt */ tcp_ca_explicit_set:1; /* was meta CC set by app? 
*/ + /* socket count in this connection */ + u8 cnt_subflows; + u8 cnt_established; #define MPTCP_SCHED_DATA_SIZE 8 u8 mptcp_sched[MPTCP_SCHED_DATA_SIZE] __aligned(8); - const struct mptcp_sched_ops *sched_ops; + //const struct mptcp_sched_ops *sched_ops; + struct mptcp_sched_ops *sched_ops; struct sk_buff_head reinject_queue; /* First cache-line boundary is here minus 8 bytes. But from the @@ -343,6 +351,11 @@ struct mptcp_cb { u32 orig_window_clamp; struct tcp_info *master_info; + + /* swetank */ + u32 cnt_in_order; + u32 cnt_out_of_order; + /* end:swetankk */ }; #define MPTCP_VERSION_0 0 @@ -459,6 +472,18 @@ extern bool mptcp_init_failed; #define MPTCPHDR_INF 0x08 #define MPTCP_REINJECT 0x10 /* Did we reinject this segment? */ +/* swetankk */ +#define MPTCP_SCHED_PROBE +#ifdef CONFIG_MPTCP_QUEUE_PROBE + #define MPTCP_QUEUE_PROBE +#endif + +#ifdef MPTCP_QUEUE_PROBE + #define MPTCP_RCV_QUEUE 0x00 + #define MPTCP_OFO_QUEUE 0x01 +#endif +/* end: swetankk */ + struct mptcp_option { __u8 kind; __u8 len; @@ -647,6 +672,35 @@ struct mp_prio { __u8 addr_id; } __attribute__((__packed__)); +/* swetankk */ +#ifdef MPTCP_SCHED_PROBE +struct mptcp_sched_probe { + unsigned long id; + struct sock *sk; + bool selector_reject; + bool found_unused_reject; + bool def_unavailable; + bool temp_unavailable; + bool srtt_reject; + bool selected; + int split; + int skblen; + u32 tx_bytes; + u32 trans_start; +}; +#endif +#ifdef MPTCP_QUEUE_PROBE +struct mptcp_queue_probe { + u8 q_id; + struct tcp_sock *meta_tp; + u32 skb_seq; + u32 skb_end_seq; + u8 op_id; + u32 q_size; +}; +#endif +/* end:swetankk */ + static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum) { return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2); @@ -659,6 +713,22 @@ extern int sysctl_mptcp_version; extern int sysctl_mptcp_checksum; extern int sysctl_mptcp_debug; extern int sysctl_mptcp_syn_retries; +/* swetankk */ +extern int sysctl_mptcp_scheduler_optimizations_disabled; +extern int sysctl_mptcp_set_backup; +/* end: swetankk */ +/* shivanga */ +extern int sysctl_num_segments_flow_one; +extern int sysctl_mptcp_rate_sample; +extern int sysctl_mptcp_ratio_trigger_search; +extern int sysctl_mptcp_ratio_search_step; +extern int sysctl_mptcp_trigger_threshold; +extern int sysctl_mptcp_probe_interval_secs; +extern int sysctl_mptcp_ratio_static; + +#define MPTCP_BYTES_NOT_SENT_MAX 165 +//#define MPTCP_RATE_MAX 50 +/* end: shivanga */ extern struct workqueue_struct *mptcp_wq; @@ -668,6 +738,7 @@ extern struct workqueue_struct *mptcp_wq; pr_err(fmt, ##args); \ } while (0) +/* Iterates over all subflows */ static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp) { return (struct sock *)mptcp->tp; @@ -676,7 +747,6 @@ static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp) #define mptcp_for_each_sub(__mpcb, __mptcp) \ hlist_for_each_entry_rcu(__mptcp, &((__mpcb)->conn_list), node) -/* Must be called with the appropriate lock held */ #define mptcp_for_each_sub_safe(__mpcb, __mptcp, __tmp) \ hlist_for_each_entry_safe(__mptcp, __tmp, &((__mpcb)->conn_list), node) @@ -903,6 +973,15 @@ bool subflow_is_backup(const struct tcp_sock *tp); struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, bool zero_wnd_test); extern struct mptcp_sched_ops mptcp_sched_default; +/* swetankk */ +#ifdef MPTCP_SCHED_PROBE +extern void mptcp_sched_probe_init(struct mptcp_sched_probe *sprobe); +extern struct mptcp_sched_probe* mptcp_sched_probe_log_hook(struct mptcp_sched_probe* sprobe, 
bool selected, unsigned long sched_probe_id, struct sock *sk); +#endif +#ifdef MPTCP_QUEUE_PROBE +extern struct mptcp_queue_probe* mptcp_queue_probe_log_hook(u8 q_id, struct tcp_sock *meta_tp, struct sk_buff *skb, u8 op_id); +#endif +/* end: swetankk */ /* Initializes function-pointers and MPTCP-flags */ static inline void mptcp_init_tcp_sock(struct sock *sk) diff --git a/make_mptcp.sh b/make_mptcp.sh new file mode 100755 index 0000000000000..2c6bc56588a38 --- /dev/null +++ b/make_mptcp.sh @@ -0,0 +1,4 @@ +make -j8 +make modules -j8 +make modules_install +make install diff --git a/make_mptcp_ratio_only.sh b/make_mptcp_ratio_only.sh new file mode 100644 index 0000000000000..aa90a4946faa2 --- /dev/null +++ b/make_mptcp_ratio_only.sh @@ -0,0 +1,9 @@ +echo "Unloading mptcp_ratio..." +rmmod mptcp_ratio +echo "Buiding mptcp submodules..." +make -C . M=net/mptcp +echo "Copying mptcp_ratio.ko..." +cp net/mptcp/mptcp_ratio.ko /lib/modules/4.19.224/kernel/net/mptcp/mptcp_ratio.ko +echo "Reloading mptcp_ratio..." +modprobe mptcp_ratio +echo "Done!" diff --git a/net/Kconfig b/net/Kconfig index 274282e9b7426..2fa3f5b33150c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -340,6 +340,50 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. +config NET_TCPPROBE + tristate "TCP connection probing" + depends on INET && PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to TCP connection + state in response to incoming packets. It is used for debugging + TCP congestion avoidance modules. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use TCP connection probing can be found + at: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe + + To compile this code as a module, choose M here: the + module will be called tcp_probe. + +config NET_MPTCP_SCHED_PROBE + tristate "MPTCP scheduler probing" + depends on INET && PROC_FS && KPROBES && (MPTCP=y) + ---help--- + This module allows for probing the MPTCP scheduler decision and state. For + now, only the RTT scheduler is supported. If you don't understand what was + just said, you don't need it: say N. Design heavily borrowed from the famous + tcp_probe module: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe + + To compile this code as a module, choose M here: the module will be called + mptcp_sched_probe. + +config NET_MPTCP_QUEUE_PROBE + tristate "MPTCP meta recv and ofo queue probing" + depends on INET && PROC_FS && KPROBES && (MPTCP=y) && (MPTCP_QUEUE_PROBE=y) + ---help--- + This module allows for probing the MPTCP meta-level receive and out-of-order + queues. If you don't understand what was just said, you don't need it: say N. + Design heavily borrowed from the famous tcp_probe module: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe + + To compile this code as a module, choose M here: the module will be called + mptcp_queue_probe. + config NET_DROP_MONITOR tristate "Network packet drop alerting service" depends on INET && TRACEPOINTS diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 37f3af3db2a6e..9bb7d9059ad0d 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -109,6 +109,20 @@ config MPTCP_REDUNDANT This scheduler sends all packets redundantly over all subflows to decreases latency and jitter on the cost of lower throughput. 
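For context, once the rebuilt mptcp_ratio.ko has been reloaded by make_mptcp_ratio_only.sh, the scheduler still has to be selected and tuned at runtime. A minimal usage sketch, assuming the stock net.mptcp.mptcp_scheduler knob from the multipath-tcp.org tree (its procname is not visible in this hunk) together with the sysctls this patch adds to the same table; the values shown are illustrative only:

modprobe mptcp_ratio
# select the ratio scheduler for new MPTCP connections (pre-existing knob, assumed name)
sysctl -w net.mptcp.mptcp_scheduler=ratio
# per-round quota for subflow one out of num_segments=100 (added by this patch, default 77)
sysctl -w net.mptcp.num_segments_flow_one=77
# sampling interval in ms used by the dynamic ratio search (added by this patch)
sysctl -w net.mptcp.mptcp_rate_sample=100
# manually kick off one ratio search; the scheduler clears the flag itself (added by this patch)
sysctl -w net.mptcp.mptcp_ratio_trigger_search=1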
+config MPTCP_RTT + tristate "MPTCP RTT" + depends on (MPTCP=y) + ---help--- + This is a round-trip time (RTT) based scheduler (copy of the default one). It sends + packets to the flow with the lower RTT. + +config MPTCP_RATIO + tristate "MPTCP Ratio" + depends on (MPTCP=y) + ---help--- + This is a quota-based scheduler that assigns packets to subflows based on a given ratio. + It is based on the design of the ROUNDROBIN scheduler, which gives equal quota to all subflows. + choice prompt "Default MPTCP Scheduler" default DEFAULT_SCHEDULER @@ -133,6 +147,18 @@ choice This is the redundant scheduler, sending packets redundantly over all the subflows. + config DEFAULT_RTT + bool "RTT" if MPTCP_RTT=y + ---help--- + This is a copy of the default RTT-based scheduler. The only difference is that + it is built as a separate module, making it easier to make changes to it. + + config DEFAULT_RATIO + bool "Ratio" if MPTCP_RATIO=y + ---help--- + This is a quota-based scheduler that assigns packets to subflows based on a given ratio. + It is based on the design of the ROUNDROBIN scheduler, which gives equal quota to all subflows. + endchoice endif @@ -142,5 +168,12 @@ config DEFAULT_MPTCP_SCHED default "default" if DEFAULT_SCHEDULER default "roundrobin" if DEFAULT_ROUNDROBIN default "redundant" if DEFAULT_REDUNDANT + default "rtt" if DEFAULT_RTT + default "ratio" if DEFAULT_RATIO default "default" +config MPTCP_QUEUE_PROBE + bool "MPTCP Meta-Queues Instrumentation" + depends on (MPTCP=y) + ---help--- + This adds instrumentation in the MPTCP stack to trace meta rcv and ofo queues. diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 82a2d4d945ae0..10eccf2b375b5 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_pm.o \ - mptcp_output.o mptcp_input.o mptcp_sched.o + mptcp_output.o mptcp_input.o mptcp_sched.o mptcp_sched_probe_log_hook.o obj-$(CONFIG_TCP_CONG_LIA) += mptcp_coupled.o obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o @@ -20,5 +20,9 @@ obj-$(CONFIG_MPTCP_NETLINK) += mptcp_netlink.o obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o obj-$(CONFIG_MPTCP_REDUNDANT) += mptcp_redundant.o obj-$(CONFIG_MPTCP_BLEST) += mptcp_blest.o +obj-$(CONFIG_MPTCP_RTT) += mptcp_rtt.o +obj-$(CONFIG_MPTCP_RATIO) += mptcp_ratio.o +obj-$(CONFIG_NET_MPTCP_SCHED_PROBE) += mptcp_sched_probe.o +obj-$(CONFIG_NET_MPTCP_QUEUE_PROBE) += mptcp_queue_probe.o mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c index d9b4e30838eaa..7fe59b1275184 100644 --- a/net/mptcp/mptcp_ctrl.c +++ b/net/mptcp/mptcp_ctrl.c @@ -72,6 +72,59 @@ int sysctl_mptcp_debug __read_mostly; EXPORT_SYMBOL(sysctl_mptcp_debug); int sysctl_mptcp_syn_retries __read_mostly = 3; +/* swetankk */ +int sysctl_mptcp_scheduler_optimizations_disabled __read_mostly = 0; +EXPORT_SYMBOL(sysctl_mptcp_scheduler_optimizations_disabled); +/* end: swetankk */ + +/* shivanga */ +int sysctl_num_segments_flow_one __read_mostly = 77; +EXPORT_SYMBOL(sysctl_num_segments_flow_one); + +int sysctl_mptcp_rate_sample __read_mostly = 100; +EXPORT_SYMBOL(sysctl_mptcp_rate_sample); + +int sysctl_mptcp_ratio_static = 0; +EXPORT_SYMBOL(sysctl_mptcp_ratio_static); + +int sysctl_mptcp_ratio_trigger_search = 0; +EXPORT_SYMBOL(sysctl_mptcp_ratio_trigger_search); + +int sysctl_mptcp_ratio_search_step = 5; +EXPORT_SYMBOL(sysctl_mptcp_ratio_search_step); + +int sysctl_mptcp_trigger_threshold = 100000; //Kbps +EXPORT_SYMBOL(sysctl_mptcp_trigger_threshold); + +int 
sysctl_mptcp_set_backup = 0; +EXPORT_SYMBOL(sysctl_mptcp_set_backup); + +int sysctl_mptcp_probe_interval_secs = 0; //seconds, 0 = probe disabled +EXPORT_SYMBOL(sysctl_mptcp_probe_interval_secs); + +//int sysctl_mptcp_rate = 0; +//EXPORT_SYMBOL(sysctl_mptcp_rate); + +#define REPORT_BUF_SIZE_MAX 500 +u64 prev_tx_bytes = 0, prev_tstamp = 0; + +struct ratio_sched_priv { + u16 quota; + u32 write_seq_saved; + //u32 write_seq_jiffies; + struct timeval write_seq_tv, snd_una_tv; + u64 completion_time; + u8 is_accounting, is_init_accounted; + u32 snd_una_saved, buffer_size; + u32 delivered; +}; + +static struct ratio_sched_priv *ratio_sched_get_priv(const struct tcp_sock *tp) +{ + return (struct ratio_sched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +/* end: shivanga */ bool mptcp_init_failed __read_mostly; struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE; @@ -117,6 +170,380 @@ static int proc_mptcp_scheduler(struct ctl_table *ctl, int write, return ret; } +/* shivanga */ + +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + +static int proc_mptcp_bytes_not_sent(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[MPTCP_BYTES_NOT_SENT_MAX]; + int val_length = 0; + struct ctl_table tbl = { + .data = val, + .maxlen = MPTCP_BYTES_NOT_SENT_MAX, + }; + int ret; + + struct tcp_sock *meta_tp; + int i; + + memset(val, 0, MPTCP_BYTES_NOT_SENT_MAX); + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + //struct sock *sk_it; + struct mptcp_tcp_sock *mptcp_sock; + struct mptcp_cb *mpcb = meta_tp->mpcb; + int iter; + + /*Ask shivang whaht this is*/ + if (!mptcp(meta_tp)) + continue; + + if (!mpcb) + continue; + + iter = 0; + /*Phuc*/ + /*Double check with Shivang for these type casts*/ + mptcp_for_each_sub(mpcb, mptcp_sock) { + //struct tcp_sock *tp_it = tcp_sk(sk_it); + const struct sock *sk = mptcp_to_sock(mptcp_sock); + const struct tcp_sock *tp_it = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk);//do we need const? 
+ union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } dst; + + tcp_probe_copy_fl_to_si4(inet, dst.v4, d); + val_length += sprintf(val + val_length, "%pISpc ", &dst); + val_length += sprintf(val + val_length, "%u\n", tp_it->write_seq - tp_it->snd_una); + iter++; + } + /*******/ + val_length += sprintf(val + val_length, "\n"); + } + + rcu_read_unlock_bh(); + } + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + + return ret; +} +/* +static int proc_mptcp_num_segments_flow_one(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int val = 0; + struct ctl_table tbl = { + .data = &val, + .maxlen = sizeof(int), + }; + int ret; + + val = sysctl_num_segments_flow_one; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + struct tcp_sock *meta_tp; + int i; + sysctl_num_segments_flow_one = val; + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + struct sock *sk_it; + struct mptcp_cb *mpcb = meta_tp->mpcb; + int iter; + + if (!mptcp(meta_tp)) + continue; + + if (!mpcb) + continue; + + iter = 0; + mptcp_for_each_sk(mpcb, sk_it) { + struct tcp_sock *tp_it = tcp_sk(sk_it); + struct ratio_sched_priv *rsp = ratio_sched_get_priv(tp_it); + //rsp->quota = 0; + iter++; + } + } + + rcu_read_unlock_bh(); + } + + } + return ret; +} +*/ +static int proc_mptcp_set_pf(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int val = 0, tempval = 0; + struct ctl_table tbl = { + .data = &val, + .maxlen = sizeof(int), + }; + int ret; + + int i; + struct tcp_sock *meta_tp; + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + //struct sock *sk_it; + struct mptcp_tcp_sock *mptcp_sock; + struct mptcp_cb *mpcb = meta_tp->mpcb; + + if (!mptcp(meta_tp)) + continue; + + if (!mpcb) + continue; + + /*Phuc*/ + mptcp_for_each_sub(mpcb, mptcp_sock) { + + struct sock *sk_it = mptcp_to_sock(mptcp_sock); + struct tcp_sock *tp_it = tcp_sk(sk_it); + if (tp_it->pf) { + tempval = 1; + break; + } + + } + /*****/ + if (tempval == 1) + break; + } + + rcu_read_unlock_bh(); + if (tempval == 1) + break; + } + + val = tempval; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + struct tcp_sock *meta_tp; + int i; + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + //struct sock *sk_it; + struct mptcp_tcp_sock *mptcp_sock; + struct mptcp_cb *mpcb = meta_tp->mpcb; + int iter = 0; + + if (!mptcp(meta_tp)) + continue; + + if (!mpcb) + continue; + /*phuc*/ + mptcp_for_each_sub(mpcb, mptcp_sock) { + struct sock *sk_it = mptcp_to_sock(mptcp_sock); + struct tcp_sock *tp_it = tcp_sk(sk_it); + tp_it->pf = val; + iter++; + if (tp_it->prior_ssthresh) { + const struct inet_connection_sock *icsk = inet_csk(sk_it); + + tp_it->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk_it); + if (tp_it->prior_ssthresh > tp_it->snd_ssthresh) { + tp_it->snd_ssthresh = tp_it->prior_ssthresh; + } + tcp_set_ca_state(sk_it, TCP_CA_Recovery); + } + } + /*****/ + } + + rcu_read_unlock_bh(); + } + + } + return ret; +} + +static int proc_mptcp_rate(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t 
*ppos) +{ + u32 val = 0, tempval = 0; + struct ctl_table tbl = { + .data = &val, + .maxlen = sizeof(u32), + }; + + int ret; + + struct tcp_sock *meta_tp; + int i; + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sock *sk = NULL; + struct dst_entry *dst; + + struct netdev_queue *txq0; + struct rtnl_link_stats64 temp; + //const struct rtnl_link_stats64 *stats; + u32 tput = 0; + + if (!mptcp(meta_tp)) + continue; + + if (!mpcb) + continue; + + sk = &((meta_tp->inet_conn).icsk_inet.sk); + + if (sk) { + //printk("if sk\n"); + dst = sk_dst_get(sk); + + if (dst && dst->dev) { + const struct rtnl_link_stats64 *stats = dev_get_stats(dst->dev, &temp); + //printk("if dst->dev\n"); + //printk("%s\n", dst->dev->name); + if (strcmp(dst->dev->name, "enp5s0")) continue; + txq0 = netdev_get_tx_queue(dst->dev, 0); //get txqueueu from dst + + if (stats && txq0) { + //printk("if stats && txq0\n"); + if (!prev_tx_bytes) prev_tx_bytes = stats->tx_bytes; + if (!prev_tstamp) prev_tstamp = txq0->trans_start; + + if (prev_tx_bytes && prev_tstamp && txq0->trans_start != prev_tstamp && jiffies_to_msecs(txq0->trans_start - prev_tstamp)) { + //printk("prev_bytes: %llu, prev_tstamp: %llu, cur_bytes: %llu, cur_tstamp: %lu", prev_tx_bytes, prev_tstamp, stats->tx_bytes, txq0->trans_start); + tput = ((stats->tx_bytes - prev_tx_bytes)*8)/(jiffies_to_msecs(txq0->trans_start - prev_tstamp)); + //printk("rate: %llu\n", tput); + prev_tx_bytes = stats->tx_bytes; + prev_tstamp = txq0->trans_start; + if (!tempval) { + tempval = tput; + //printk("tempval = tput: %u\n", tempval); + } //else printk("tempval again\n"); + break; + } + } + } + } + } + + rcu_read_unlock_bh(); + + //if (tempval) break; + + } + + val = tempval; + + //printk("ret: %u\n", val); + + ret = proc_douintvec(&tbl, write, buffer, lenp, ppos); + + return ret; +} + + +static int proc_mptcp_buffer_size(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[REPORT_BUF_SIZE_MAX]; + int val_length = 0; + struct ctl_table tbl = { + .data = val, + .maxlen = REPORT_BUF_SIZE_MAX, + }; + int ret; + + struct tcp_sock *meta_tp; + int i; + + memset(val, 0, REPORT_BUF_SIZE_MAX); + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + //struct sock *sk_it, *sk; + struct sock *sk; + struct mptcp_tcp_sock *mptcp_sock; + struct mptcp_cb *mpcb = meta_tp->mpcb; + int iter; + + if (!mptcp(meta_tp)) + continue; + + if (!mpcb) + continue; + + sk = &((meta_tp->inet_conn).icsk_inet.sk); + if (sk) { + struct dst_entry *dst = sk_dst_get(sk); + + if (dst && dst->dev) { + if (strcmp(dst->dev->name, "enp5s0")) continue; + iter = 0; + /*phuc*/ + mptcp_for_each_sub(mpcb, mptcp_sock) { + struct sock *sk_it = mptcp_to_sock(mptcp_sock); + struct tcp_sock *tp_it = tcp_sk(sk_it); + struct ratio_sched_priv *rsp = ratio_sched_get_priv(tp_it); + /*****/ + if (rsp->delivered) { + do_div(rsp->buffer_size, rsp->delivered); + val_length += sprintf(val + val_length, "%u ", rsp->buffer_size); + rsp->buffer_size = 0; + rsp->delivered = 0; + //val_length += sprintf(val + val_length, "%u\n", tp_it->write_seq - tp_it->snd_una); + } + iter++; + } + if (meta_tp->delivered) { + val_length += sprintf(val + val_length, "\n"); + break; + } + } + } + } + + rcu_read_unlock_bh(); + } 
+ + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + + return ret; +} + static struct ctl_table mptcp_table[] = { { .procname = "mptcp_enabled", @@ -167,6 +594,97 @@ static struct ctl_table mptcp_table[] = { .maxlen = MPTCP_SCHED_NAME_MAX, .proc_handler = proc_mptcp_scheduler, }, + /* swetankk */ + { + .procname = "mptcp_scheduler_optimizations_disabled", + .data = &sysctl_mptcp_scheduler_optimizations_disabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_set_backup", + .data = &sysctl_mptcp_set_backup, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + /* end: swetankk */ + /* shivanga */ + { + .procname = "num_segments_flow_one", + .data = &sysctl_num_segments_flow_one, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_bytes_not_sent", + .maxlen = MPTCP_BYTES_NOT_SENT_MAX, + .mode = 0644, + .proc_handler = proc_mptcp_bytes_not_sent, + }, + { + .procname = "mptcp_pf", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_mptcp_set_pf, + }, + { + .procname = "mptcp_rate", + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_mptcp_rate, + }, + { + .procname = "mptcp_rate_sample", + .data = &sysctl_mptcp_rate_sample, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_ratio_static", + .data = &sysctl_mptcp_ratio_static, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_ratio_trigger_search", + .data = &sysctl_mptcp_ratio_trigger_search, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_ratio_search_step", + .data = &sysctl_mptcp_ratio_search_step, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_trigger_threshold", + .data = &sysctl_mptcp_trigger_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_probe_interval_secs", + .data = &sysctl_mptcp_probe_interval_secs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "mptcp_buffer_size", + .maxlen = REPORT_BUF_SIZE_MAX, + .mode = 0644, + .proc_handler = proc_mptcp_buffer_size, + }, + /* end: shivanga */ { } }; diff --git a/net/mptcp/mptcp_queue_probe.c b/net/mptcp/mptcp_queue_probe.c new file mode 100644 index 0000000000000..0036fa44a3d90 --- /dev/null +++ b/net/mptcp/mptcp_queue_probe.c @@ -0,0 +1,253 @@ +/* + * mptcp_queue_probe - Observe the MPTCP meta ofo and recv queues with kretprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Swetank Kumar Saha "); +MODULE_DESCRIPTION("MPTCP meta queues snooper"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); + +static int port __read_mostly; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static unsigned int bufsize __read_mostly = 4096; +MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); +module_param(bufsize, uint, 0); + +static const char procname[] = "mptcp_queue_probe"; + +struct tcp_log { + ktime_t tstamp; + + u8 queue; + u32 queue_size; + u32 ofo_tstamp; + u32 seq; + u32 end_seq; + u8 operation; +}; + +static struct { + spinlock_t lock; + wait_queue_head_t wait; + ktime_t start; + u32 lastcwnd; + + unsigned long head, tail; + struct tcp_log *log; +} tcp_probe; + +static inline int tcp_probe_used(void) +{ + return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); +} + +static inline int tcp_probe_avail(void) +{ + return bufsize - tcp_probe_used() - 1; +} + +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + +static void log_tcp_params(struct mptcp_queue_probe* qprobe) +{ + //const struct tcp_sock *meta_tp = qprobe->meta_tp; + //const struct mptcp_cb *mpcb = meta_tp->mpcb; + + spin_lock(&tcp_probe.lock); + /* If log fills, just silently drop*/ + if (tcp_probe_avail() > 1){ + struct tcp_log *p = tcp_probe.log + tcp_probe.head; + + p->tstamp = ktime_get(); + + p->queue = qprobe->q_id; /* 0: RCV_Q, 1: OFO_Q */ + p->queue_size = qprobe->q_size; + p->operation = qprobe->op_id; + p->seq = qprobe->skb_seq; + p->end_seq = qprobe->skb_end_seq; + + tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); + } + spin_unlock(&tcp_probe.lock); + wake_up(&tcp_probe.wait); +} + +static int kmptcp_queue_probe_log_hook(struct kretprobe_instance *ri, struct pt_regs *regs) { + struct mptcp_queue_probe* qprobe; + + qprobe = (struct mptcp_queue_probe*) regs_return_value(regs); + log_tcp_params(qprobe); + return 0; +} + +static struct kretprobe mptcp_kprobe = { + .kp = { + .symbol_name = "mptcp_queue_probe_log_hook", + }, + .handler = kmptcp_queue_probe_log_hook, +}; + +static int tcpprobe_open(struct inode *inode, struct file *file) +{ + /* Reset (empty) log */ + spin_lock_bh(&tcp_probe.lock); + tcp_probe.head = tcp_probe.tail = 0; + tcp_probe.start = ktime_get(); + spin_unlock_bh(&tcp_probe.lock); + + return 0; +} + +static int tcpprobe_sprint(char *tbuf, int n) +{ + const struct tcp_log *p + = tcp_probe.log + tcp_probe.tail; + struct timespec tv + = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); + + return scnprintf(tbuf, n, + "%lu.%09lu %x %u %u %u %x\n", + (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_nsec, p->queue, p->seq, p->end_seq, p->queue_size, p->operation); +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0; + size_t cnt = 0; + + if (!buf) + return -EINVAL; + + while (cnt < len) { + char tbuf[256]; + int width; + + /* Wait for data in buffer */ + error = wait_event_interruptible(tcp_probe.wait, + tcp_probe_used() > 0); + + if (error) + break; + + spin_lock_bh(&tcp_probe.lock); + if (tcp_probe.head == tcp_probe.tail) { + /* multiple readers race? 
*/ + spin_unlock_bh(&tcp_probe.lock); + continue; + } + + width = tcpprobe_sprint(tbuf, sizeof(tbuf)); + + if (cnt + width < len) + tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); + + spin_unlock_bh(&tcp_probe.lock); + + /* if record greater than space available + return partial buffer (so far) */ + if (cnt + width >= len) + break; + + if (copy_to_user(buf + cnt, tbuf, width)) + return -EFAULT; + + cnt += width; + } + + return cnt == 0 ? error : cnt; +} + +static const struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, + .llseek = noop_llseek, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + /* Warning: if the function signature of tcp_rcv_established, + * has been changed, you also have to change the signature of + * jtcp_rcv_established, otherwise you end up right here! + */ + + init_waitqueue_head(&tcp_probe.wait); + spin_lock_init(&tcp_probe.lock); + + if (bufsize == 0) + return -EINVAL; + + bufsize = roundup_pow_of_two(bufsize); + tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); + if (!tcp_probe.log) + goto err0; + + if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) + goto err0; + + ret = register_kretprobe(&mptcp_kprobe); + + if (ret) + goto err1; + + pr_info("probe registered bufsize=%u\n", bufsize); + return 0; + err1: + remove_proc_entry(procname, init_net.proc_net); + err0: + kfree(tcp_probe.log); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + remove_proc_entry(procname, init_net.proc_net); + unregister_kretprobe(&mptcp_kprobe); + kfree(tcp_probe.log); +} +module_exit(tcpprobe_exit); diff --git a/net/mptcp/mptcp_ratio.c b/net/mptcp/mptcp_ratio.c new file mode 100644 index 0000000000000..82624b7bfd727 --- /dev/null +++ b/net/mptcp/mptcp_ratio.c @@ -0,0 +1,1018 @@ +/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ + +#include +#include +#include +// swetankk +#include + +// shivanga +#define timersub(tvp, uvp, vvp) \ + do { \ + (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ + (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ + if ((vvp)->tv_usec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_usec += 1000000; \ + } \ + } while (0) + +static unsigned int num_segments __read_mostly = 100; +module_param(num_segments, uint, 0644); +MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst"); + +static bool cwnd_limited __read_mostly = 0; +module_param(cwnd_limited, bool, 0644); +MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows"); + +struct ratio_sched_priv { + u16 quota; + u32 write_seq_saved; + //u32 write_seq_jiffies; + struct timeval write_seq_tv, snd_una_tv; + u64 completion_time; + u8 is_accounting, is_init_accounted; + u32 snd_una_saved, buffer_size; + u32 delivered; +}; + +static struct ratio_sched_priv *ratio_sched_get_priv(const struct tcp_sock *tp) +{ + return (struct ratio_sched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +u8 sample_skip_ad = 2, sample_skip_ac = 2; +struct sock *blocked_sk = NULL; +unsigned int num_segments_flow_one; //WILL THIS BE CREATED FOR EACH COPY? + +/* If the sub-socket sk available to send the skb? 
*/ +static bool mptcp_ratio_is_available(struct sock *sk, const struct sk_buff *skb, + bool zero_wnd_test, bool cwnd_test) +{ +/*Availability: in_flight < cwnd, room in the send window, subflow established and not in loss recovery*/ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int space, in_flight; + + if (tp->mptcp->pre_established) { + //printk("tp is in PRE_ESTABLISHED state"); + return false; + } + //printk("tp is not in PRE_ESTABLISHED state"); + + if (tp->pf) { + //printk("tp->pf is set"); + return false; + } + + + //printk("tp->pf is not set"); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { + /* If SACK is disabled, and we got a loss, TCP does not exit + * the loss-state until something above high_seq has been acked. + * (see tcp_try_undo_recovery) + * + * high_seq is the snd_nxt at the moment of the RTO. As soon + * as we have an RTO, we won't push data on the subflow. + * Thus, snd_una can never go beyond high_seq. + */ + if (!tcp_is_reno(tp)) { + printk("tcp_is_reno"); + return false; + } + else if (tp->snd_una != tp->high_seq) { + printk("tp->snd_una != tp->high_seq"); + return false; + } + } + + //printk("TCP_CA_Loss"); + + if (!tp->mptcp->fully_established) { + /* Make sure that we send in-order data */ + if (skb && tp->mptcp->second_packet && + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) { + printk("tp->mptcp->fully_established is false"); + return false; + } + } + + //printk("tp->mptcp->fully_established is true"); + + if (!cwnd_test) + goto zero_wnd_test; + + in_flight = tcp_packets_in_flight(tp); + /* Not even a single spot in the cwnd */ + if (in_flight >= tp->snd_cwnd) + return false; + + /* Now, check if what is queued in the subflow's send-queue + * already fills the cwnd. + */ + space = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + if (tp->write_seq - tp->snd_nxt > space) + return false; + +zero_wnd_test: + if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) { + printk("zero_wnd_test"); + return false; + } + + return true; +} + +/* Are we not allowed to reinject this skb on tp? */ +static int mptcp_ratio_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) +{ + /* If the skb has already been enqueued in this sk, try to find + * another one. + */ + return skb && + /* Has the skb already been enqueued into this subsocket? */ + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; +} + +/* We just look for any subflow that is available */ +static struct sock *ratio_get_available_subflow(struct sock *meta_sk, + struct sk_buff *skb, + bool zero_wnd_test) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *sk=NULL, *bestsk = NULL, *backupsk = NULL; + struct mptcp_tcp_sock *mptcp; + /****/ + +#ifdef MPTCP_SCHED_PROBE + struct mptcp_tcp_sock *mptcp_it; + struct mptcp_sched_probe sprobe; + unsigned long sched_probe_id; + + mptcp_sched_probe_init(&sprobe); + get_random_bytes(&sched_probe_id, sizeof(sched_probe_id)); +#endif + + if (mpcb->cnt_subflows == 1) { + /* if there is only one subflow, bypass the scheduling function */ + sk = (struct sock *)mpcb->connection_list; + if (!mptcp_ratio_is_available(sk, skb, false, cwnd_limited)) + sk = NULL; + return sk; + } + + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + /* Answer data_fin on same subflow!!! 
*/ + mptcp_for_each_sub(mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); +#ifdef MPTCP_SCHED_PROBE + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_ratio_is_available(sk, skb, zero_wnd_test, true)) { + if (sk) mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk); + return sk; + } +#else + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_ratio_is_available(sk, skb, zero_wnd_test, true)) + return sk; +#endif + } + } + + /* First, find the best subflow */ + mptcp_for_each_sub(mpcb, mptcp) { + struct tcp_sock *tp = tcp_sk(mptcp_to_sock(mptcp)); + if (!mptcp_ratio_is_available(mptcp_to_sock(mptcp), skb, zero_wnd_test, true)) + continue; + + if (mptcp_ratio_dont_reinject_skb(tp, skb)) { + backupsk = mptcp_to_sock(mptcp); + continue; + } + + bestsk = mptcp_to_sock(mptcp); + } + + if (bestsk) { + sk = bestsk; + } + else if (backupsk) { + /* It has been sent on all subflows once - let's give it a + * chance again by restarting its pathmask. + */ + if (skb) + TCP_SKB_CB(skb)->path_mask = 0; + sk = backupsk; + } +#ifdef MPTCP_SCHED_PROBE + mptcp_for_each_sub(mpcb, mptcp_it) { + struct sock *sk_it = mptcp_to_sock(mptcp_it); + if (sk && sk_it == sk) mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk); + else mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk); + } +#endif + return sk; +} + +u32 get_queue_size(struct sock *sk, struct tcp_sock *meta_tp){ + struct dst_entry *dst; + struct netdev_queue *txq0; + struct dql *dql0; + struct Qdisc *qdisc; + struct rtnl_link_stats64 temp; + u32 packets_in_queue; + u64 tput = 0; + + dst = sk_dst_get(sk); + + if (dst->dev) { + const struct rtnl_link_stats64 *stats = dev_get_stats(dst->dev, &temp); + + txq0 = netdev_get_tx_queue(dst->dev, 0); //get txqueueu from dst + dql0 = &txq0->dql; + qdisc = txq0->qdisc; + + if (!meta_tp->prev_tx_bytes) meta_tp->prev_tx_bytes = stats->tx_bytes; + if (!meta_tp->prev_tstamp) meta_tp->prev_tstamp = txq0->trans_start; + + if (meta_tp->prev_tx_bytes && meta_tp->prev_tstamp && txq0->trans_start != meta_tp->prev_tstamp) { + tput = ((stats->tx_bytes - meta_tp->prev_tx_bytes)*8)/(jiffies_to_msecs(txq0->trans_start - meta_tp->prev_tstamp)); + meta_tp->prev_tx_bytes = stats->tx_bytes; + meta_tp->prev_tstamp = txq0->trans_start; + } + + packets_in_queue = dql0->num_queued - dql0->num_completed; //number of packets in DQL + + } + return tput; +} + +/* Returns the next segment to be sent from the mptcp meta-queue. + * (chooses the reinject queue if any segment is waiting in it, otherwise, + * chooses the normal write queue). + * Sets *@reinject to 1 if the returned segment comes from the + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, + * and sets it to -1 if it is a meta-level retransmission to optimize the + * receive-buffer. 
+ */ +static struct sk_buff *__mptcp_ratio_next_segment(const struct sock *meta_sk, int *reinject) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sk_buff *skb = NULL; + + *reinject = 0; + + /* If we are in fallback-mode, just take from the meta-send-queue */ + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) + return tcp_send_head(meta_sk); + + skb = skb_peek(&mpcb->reinject_queue); + + if (skb) + *reinject = 1; + else + skb = tcp_send_head(meta_sk); + + return skb; +} + +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + + +static struct sk_buff *mptcp_ratio_next_segment(struct sock *meta_sk, + int *reinject, + struct sock **subsk, + unsigned int *limit) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *choose_sk=NULL;//chosen socket + struct mptcp_tcp_sock *mptcp;//an mptcp_socket + struct sk_buff *skb = __mptcp_ratio_next_segment(meta_sk, reinject); + unsigned int split = num_segments;// + unsigned char iter = 0, full_subs = 0, counter = 0; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + u32 total_rate, rate_ad, rate_ac; + u32 last_rate, in_search, last_trigger_tstamp, count_set_init_rate, init_rate; + u32 buffer_total, init_buffer_total, trigger_threshold; + u32 srtt[2]={0xffffffff,0xffffffff}; + u32 min_rtt[2]={0xffffffff,0xffffffff}; + u32 num_acks[2]={0,0}; + u32 num_acks_head[2]={0,0}; + int rate_diff, buffer_diff; + int rate_diff_sub[2] = {0,0}; + int buffer_sub[2] = {0,0}; + u32 last_buffer_size[2] = {0, 0}, init_buffer_size[2] = {0, 0}, tput[2] = {0, 0}; + u16 head_length; + u8 threshold_cnt; + u8 buffer_threshold_cnt; + unsigned int time_diff, loop_counter = 0; + +#ifdef MPTCP_SCHED_PROBE + struct mptcp_sched_probe sprobe; + unsigned long sched_probe_id; + + get_random_bytes(&sched_probe_id, sizeof(sched_probe_id)); + mptcp_sched_probe_init(&sprobe); + +#endif + /*Intial parameter setup for meta_tp*/ + if (meta_tp->run_started == 0) { + meta_tp->run_started = 1; + num_segments_flow_one = meta_tp->num_segments_flow_one = sysctl_num_segments_flow_one; + meta_tp->ratio_search_step = sysctl_mptcp_ratio_search_step; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + if(inet_sk(meta_sk)->inet_daddr) + printk("MPTCP Run Started, destination: %pI4", &inet_sk(meta_sk)->inet_daddr); + else + printk("MPTCP Run Started"); + } + + + /* As we set it, we have to reset it as well. 
*/ + *limit = 0; + + if (!skb) + return NULL; + + if (*reinject) { + /*Reinjected segment*/ + *subsk = ratio_get_available_subflow(meta_sk, skb, false); + if (!*subsk) + return NULL; + + return skb; + } + +/*Schedule the next segment*/ + +retry: + mptcp_for_each_sub(mpcb, mptcp) { + /*Get scheduler private information*/ + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it = tcp_sk(sk_it); + struct ratio_sched_priv *rsp = ratio_sched_get_priv(tp_it); + /***********************************/ + + const struct inet_sock *inet = inet_sk(sk_it); + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } dst; + + + counter++;//this is to keep track of subflow index + tcp_probe_copy_fl_to_si4(inet, dst.v4, d); //useless + + /*Sanity check*/ + if (!mptcp_ratio_is_available(sk_it, skb, false, cwnd_limited)){ + //printk("flow rejected"); + continue; + } + + /*Counter to compare with full_subs for round restart*/ + iter++; + + /* Check subflow with odd index + * full_sub > 0: subflow reached full quota*/ + if (counter % 2) { + if (meta_tp->num_segments_flow_one == 0) { + full_subs++; + continue; + } + + /*This subflow is being used but not yet reached the quota*/ + if (rsp->quota > 0 && rsp->quota < meta_tp->num_segments_flow_one) { + split = meta_tp->num_segments_flow_one - rsp->quota; + choose_sk = sk_it;//choose this subflow + goto found; + } + + /*Nothing scheduled on this subflow yet: choose it*/ + if (!rsp->quota) { + split = meta_tp->num_segments_flow_one; + choose_sk = sk_it; + } + + /* Or, it must then be fully used*/ + if (rsp->quota >= meta_tp->num_segments_flow_one) + full_subs++; + } + /* Consider the even-indexed subflows*/ + else { + /*This subflow has reached full quota*/ + if (num_segments - meta_tp->num_segments_flow_one == 0) { + full_subs++; + continue; + } + + /*This subflow is being used but not yet reached the quota*/ + if (rsp->quota > 0 && rsp->quota < (num_segments - meta_tp->num_segments_flow_one)) { + split = (num_segments - meta_tp->num_segments_flow_one) - rsp->quota; + choose_sk = sk_it; + goto found; + } + + /*Nothing scheduled on this subflow yet*/ + if (!rsp->quota) { + split = num_segments - meta_tp->num_segments_flow_one; + choose_sk = sk_it; + } + + /* Or, it must then be fully used */ + if (rsp->quota >= (num_segments - meta_tp->num_segments_flow_one)) + full_subs++; + } + } + + /* All subflows reach quota, we restart this round by setting quota to 0 and retry + * to find a subflow. 
+ */ + if (iter && iter == full_subs) { + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it = tcp_sk(sk_it); + struct ratio_sched_priv *rsp = ratio_sched_get_priv(tp_it); + + if (!mptcp_ratio_is_available(sk_it, skb, false, cwnd_limited)) + continue; + + rsp->quota = 0; + } + goto retry; + } + +found: + /*We have fould the chosen socket*/ + if (choose_sk) { + unsigned int mss_now; + struct tcp_sock *choose_tp = tcp_sk(choose_sk); + struct ratio_sched_priv *rsp = ratio_sched_get_priv(choose_tp); + const struct inet_sock *inet = inet_sk(choose_sk); + + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } dst; + + if (!mptcp_ratio_is_available(choose_sk, skb, false, true)) + return NULL; + + tcp_probe_copy_fl_to_si4(inet, dst.v4, d); + *subsk = choose_sk; + mss_now = tcp_current_mss(*subsk); + *limit = split * mss_now; + + /*The number of quota would be how many segements that we decided to send + * on the choose_sk*/ + if (skb->len > mss_now) + rsp->quota += DIV_ROUND_UP(skb->len, mss_now); + else + rsp->quota++; + + //printk("skb->len: %u, mss_now: %u, mss_cache: %u",skb->len , mss_now, choose_tp->mss_cache); +#ifdef MPTCP_SCHED_PROBE + iter = total_rate = rate_ad = rate_ac = 0; + + /*Supposedly useless*/ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it = tcp_sk(sk_it); + u32 subflow_rate, subflow_intv; + u64 subflow_rate64 = 0; + + mptcp_sched_probe_init(&sprobe); + iter++; + + subflow_rate = READ_ONCE(tp_it->rate_delivered); + subflow_intv = READ_ONCE(tp_it->rate_interval_us); + if (subflow_rate && subflow_intv) { + subflow_rate64 = (u64)subflow_rate * tp_it->mss_cache * USEC_PER_SEC; + do_div(subflow_rate64, subflow_intv); + subflow_rate64 *= 8; + + if (subflow_rate64 != tp_it->last_ac_rate) { + if (iter == 1) { + rate_ad += subflow_rate64; + } else { + rate_ac += subflow_rate64; + } + tp_it->last_ac_rate = subflow_rate64; + do_div(subflow_rate64, 1000000); + tp_it->rate_est_val += subflow_rate64; + tp_it->rate_est_cnt++; + tp_it->in_probe = 0; + total_rate += subflow_rate64; + } else + tp_it->in_probe++; + } + + + if (!mptcp_ratio_is_available(sk_it, skb, false, cwnd_limited)) sprobe.temp_unavailable = true; + + if (choose_sk == sk_it) { + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk_it); + } + else mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk_it); + }/*Supposedly useless*/ + + /* AUTO-RATE search */ + do_div(total_rate, 1000000); + meta_tp->rate_delivered += total_rate;//no use + meta_tp->delivered++; + + + iter = 0; + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it_temp = tcp_sk(sk_it); + struct ratio_sched_priv *rsp_temp = ratio_sched_get_priv(tp_it_temp); + rsp_temp->buffer_size += (tp_it_temp->write_seq - tp_it_temp->snd_una); + buffer_sub[iter] = rsp_temp->buffer_size; + rsp_temp->delivered++; + iter++; + } + + + time_diff = jiffies_to_msecs(jiffies - meta_tp->rate_interval_us); + meta_tp->head_length = meta_tp->ratio_rate_sample/2; + if(time_diff== meta_tp->head_length && meta_tp->lost){ + in_search = meta_tp->lost; + //printk("Time elapsed since last change: %u", time_diff); + iter = 0; + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it_temp = tcp_sk(sk_it); + u32 subflow_rate; + subflow_rate = tp_it_temp->delivered - tp_it_temp->prev_tx_bytes; + meta_tp->num_acks_head[iter] = subflow_rate; + 
iter++; + } + // printk("Number of ACKs collected: %u", + // meta_tp->num_acks_head[0] + meta_tp->num_acks_head[1]); + } + + /*start dynamic ratio search*/ + if (time_diff >= meta_tp->ratio_rate_sample) { + /*Load parameter from previous probe interval*/ + last_rate = meta_tp->prr_out;//get last_rate from the container + trigger_threshold = meta_tp->prr_delivered; + in_search = meta_tp->lost; + threshold_cnt = (meta_tp->snd_ssthresh == INT_MAX) ? 0 : meta_tp->snd_ssthresh; + buffer_threshold_cnt = meta_tp->buffer_threshold_cnt; + last_trigger_tstamp = meta_tp->prior_ssthresh; + count_set_init_rate = meta_tp->total_retrans; + init_rate = meta_tp->prior_cwnd; + + num_acks_head[0] = (in_search==0)? 0: meta_tp->num_acks_head[0]; + num_acks_head[1] = (in_search==0)? 0: meta_tp->num_acks_head[1]; + head_length = (num_acks_head[0]==0)? 0:meta_tp->head_length; + //printk("head_length this period: %u, since search is %d", + // head_length, in_search); + memcpy(init_buffer_size, meta_tp->init_buffer_size, 2*sizeof(u32)); + memcpy(last_buffer_size, meta_tp->last_buffer_size, 2*sizeof(u32)); + /*End loading*/ + + iter = 0; + meta_tp->rate_delivered = 0; //reset this container so that we can use it to calculate rate + + /*Value estimation for each interface*/ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it_temp = tcp_sk(sk_it); + struct ratio_sched_priv *rsp_temp = ratio_sched_get_priv(tp_it_temp); + u32 subflow_rate, subflow_intv, curr_tstamp; + u64 subflow_rate64 = 0; + do_div(rsp_temp->buffer_size, meta_tp->delivered); + do_div(buffer_sub[iter], 1000);//KB + curr_tstamp = jiffies; + //printk("Original ACKs %llu", tp_it_temp->delivered- + // tp_it_temp->prev_tx_bytes); + subflow_rate = abs(tp_it_temp->delivered - tp_it_temp->prev_tx_bytes - num_acks_head[iter]);//number of ACKs came back + //printk("Deducted ACKs: %u", subflow_rate); + num_acks[iter] = subflow_rate; + tp_it_temp->prev_tx_bytes = tp_it_temp->delivered; + subflow_intv = jiffies_to_msecs(curr_tstamp + - tp_it_temp->prev_tstamp)-head_length; + tp_it_temp->prev_tstamp = curr_tstamp; + subflow_rate64 = (u64)subflow_rate * tp_it_temp->mss_cache * 8 * MSEC_PER_SEC; + do_div(subflow_rate64, subflow_intv); + do_div(subflow_rate64, 1000000);//subflow_intv is in us + srtt[iter] = tp_it_temp->srtt_us>>3; + min_rtt[iter] = tcp_min_rtt(tp_it_temp); + tput[iter] = subflow_rate64; + meta_tp->rate_delivered += tput[iter];//cummulate rate on both interface + rsp_temp->snd_una_saved = tp_it_temp->snd_una; + iter++; + }/*Value estimation for each interface*/ + + for (iter = 0; iter < 5; iter++) { + if (iter == 4) + meta_tp->last_rate_search_start[iter] = meta_tp->rate_delivered; + else + meta_tp->last_rate_search_start[iter] = meta_tp->last_rate_search_start[iter+1];//keep shifting to get the updated rate + } + + if (inet_sk(meta_sk)->inet_daddr) + { + printk("ratio: %d" + ", rate_ad: %u" + ", rate_ac: %u" + ", srtt_ad: %u" + ", srtt_ac: %u" + ", num_acks_ad: %u" + ", num_acks_ac: %u\n", + meta_tp->num_segments_flow_one, + tput[0], + tput[1], + srtt[0], + srtt[1], + num_acks[0], + num_acks[1]); + printk("rate_thresh_cnt: %d" + ", buffer_thresh_cnt: %d\n", + threshold_cnt, + buffer_threshold_cnt); + } + + if (!in_search && !last_rate) { + /*Calculate the initial rate got started*/ + count_set_init_rate++;//how many count do we like to average out for init rate + printk("Entered: In search = 0, last rate = 0");// + if (count_set_init_rate == 5) { + last_rate = init_rate; + trigger_threshold = 25 * 
last_rate / 100; + loop_counter = 0; + meta_tp->buffer_trigger_threshold = 0; + mptcp_for_each_sub(mpcb, mptcp) { + meta_tp->buffer_trigger_threshold += init_buffer_size[loop_counter]; + last_buffer_size[loop_counter] = init_buffer_size[loop_counter]; + loop_counter++; + } + meta_tp->buffer_trigger_threshold = -15 * meta_tp->buffer_trigger_threshold / 100; + count_set_init_rate = 0; + } else { + init_rate = (init_rate * (count_set_init_rate - 1) + meta_tp->rate_delivered) / count_set_init_rate; + loop_counter = 0; + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it_temp = tcp_sk(sk_it); + struct ratio_sched_priv *rsp_temp = ratio_sched_get_priv(tp_it_temp); + init_buffer_size[loop_counter] = (init_buffer_size[loop_counter] * (count_set_init_rate - 1) + rsp_temp->buffer_size) / count_set_init_rate; + loop_counter++; + } + goto reset; + } + } + + + if (sysctl_mptcp_ratio_trigger_search) { + /*Manual trigger using sysctl*/ + sysctl_mptcp_ratio_trigger_search = 0; + goto search_start; + } + + + if (sysctl_mptcp_probe_interval_secs && last_trigger_tstamp && (jiffies_to_msecs(jiffies - last_trigger_tstamp) >= sysctl_mptcp_probe_interval_secs*1000)) { + printk("Periodic Search\n"); + goto search_start; + } + + + if (!meta_tp->rate_delivered && !last_rate) { + in_search = false; + goto reset; + } + + if (!in_search && last_rate && !sysctl_mptcp_ratio_static) { + /*Not in search but last rate !=0:*/ + rate_diff = (int)meta_tp->rate_delivered - (int)init_rate; + buffer_total = 0, init_buffer_total = 0; + loop_counter = 0; + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it_temp = tcp_sk(sk_it); + struct ratio_sched_priv *rsp_temp = ratio_sched_get_priv(tp_it_temp); + buffer_total += rsp_temp->buffer_size; + init_buffer_total += init_buffer_size[loop_counter++]; + } + + buffer_diff = (int)buffer_total - (int)init_buffer_total; + + if (rate_diff > trigger_threshold) { + threshold_cnt++; + } else if (buffer_diff < meta_tp->buffer_trigger_threshold) { + buffer_threshold_cnt++; + } else { + buffer_threshold_cnt = 0; + threshold_cnt = 0; + } + + if (!meta_tp->init_search) { + printk("INITIAL SEARCH\n"); + meta_tp->init_search = true; + goto search_start; + } + + if (buffer_threshold_cnt == 5 || threshold_cnt == 5) { + /*Search trigger condition met*/ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + if (tcp_in_slow_start(tcp_sk(sk_it))) { + printk("in slow start\n"); + goto nosearch; + } + } + + if (jiffies_to_msecs(jiffies - last_trigger_tstamp) <= 2000) { + printk("less than 2 seconds\n"); + goto nosearch; + } +search_start: + printk("SEARCH START:\n"); + /*Triggering conditions*/ + if(buffer_threshold_cnt==5) + { + printk("DECREASED SEND QUEUE\n"); + } + else if(threshold_cnt==5) + { + printk("DECREASED THROUGHPUT\n"); + } + else + { + printk("INITIAL or PERIODIC SEARCH\n"); + } + + /*Set search state and reset counter for the next interval*/ + in_search = true; + threshold_cnt = 0; + buffer_threshold_cnt = 0; + + /*Increase sampling interval*/ + meta_tp->ratio_rate_sample = meta_tp->ratio_rate_sample*4; + last_trigger_tstamp = jiffies; + + if (meta_tp->num_segments_flow_one <= (100 - abs(meta_tp->ratio_search_step))) { + meta_tp->search_state = RIGHT_RATIO_SET; + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step; + } + else { + meta_tp->search_state = SEARCH_RATE; + meta_tp->ratio_search_step = -1*abs(meta_tp->ratio_search_step); + 
meta_tp->num_segments_flow_one += meta_tp->ratio_search_step; + } + + + last_rate = 0; + /*When search starts, get the average of of the last iter*sampling_time ms for comparable*/ + for (iter = 0; iter < 5; iter++) + last_rate += meta_tp->last_rate_search_start[iter]; + do_div(last_rate, 5); + goto reset; +nosearch: + printk("NO SEARCH\n"); + last_rate = 0; + threshold_cnt = 0; + buffer_threshold_cnt = 0; + goto reset; + } + + } + + + /*Start ratio searching*/ + if (in_search) { + switch(meta_tp->search_state) { + case RIGHT_RATIO_SET: + printk("RIGHT_RATIO_SET"); + if (meta_tp->rate_delivered > last_rate) { + if (meta_tp->num_segments_flow_one + meta_tp->ratio_search_step <= 100) { + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step; + meta_tp->search_state = SEARCH_RATE; + } else { + last_rate = 0; + in_search = false; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + goto reset; + } + } else if (meta_tp->num_segments_flow_one - 2*meta_tp->ratio_search_step > 0) { + meta_tp->num_segments_flow_one -= 2*meta_tp->ratio_search_step; + meta_tp->search_state = LEFT_RATIO_SET; + goto reset; + } else { + meta_tp->num_segments_flow_one -= meta_tp->ratio_search_step; + last_rate = 0; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + in_search = false; + goto reset; + } + break; + case LEFT_RATIO_SET: + printk("LEFT_RATIO_SET"); + if (meta_tp->rate_delivered > last_rate) { + meta_tp->ratio_search_step = -1*abs(meta_tp->ratio_search_step); + if (meta_tp->num_segments_flow_one > abs(meta_tp->ratio_search_step)) { + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step; + meta_tp->search_state = SEARCH_RATE; + } else { + last_rate = 0; + in_search = false; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + goto reset; + } + } else { + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step; + last_rate = 0; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + in_search = false; + goto reset; + } + break; + case SEARCH_RATE: + printk("SEARCH_RATE"); + if (meta_tp->rate_delivered < last_rate) { + meta_tp->num_segments_flow_one -= meta_tp->ratio_search_step; + meta_tp->ratio_search_step = abs(meta_tp->ratio_search_step); + if (meta_tp->num_segments_flow_one + meta_tp->ratio_search_step/2 <= 100) { + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step/2; + meta_tp->search_state = RIGHT_RATIO_FINE; + } else { + meta_tp->num_segments_flow_one -= meta_tp->ratio_search_step/2; + meta_tp->search_state = LEFT_RATIO_FINE; + } + goto reset; + } else { + if (meta_tp->num_segments_flow_one + meta_tp->ratio_search_step <= 100 && meta_tp->num_segments_flow_one + meta_tp->ratio_search_step > 0) + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step; + else { + last_rate = 0; + in_search = false; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + goto reset; + } + } + break; + case RIGHT_RATIO_FINE: + printk("RIGHT_RATIO_FINE"); + if (meta_tp->rate_delivered > last_rate) { + last_rate = 0; + in_search = false; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + } else { + if (meta_tp->num_segments_flow_one > meta_tp->ratio_search_step) { + meta_tp->num_segments_flow_one -= meta_tp->ratio_search_step; + meta_tp->search_state = LEFT_RATIO_FINE; + } else { + meta_tp->num_segments_flow_one -= meta_tp->ratio_search_step/2; + last_rate = 0; + in_search = false; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + } + } + goto reset; + break; + case LEFT_RATIO_FINE: + printk("LEFT_RATIO_FINE"); + if (meta_tp->rate_delivered <= 
last_rate) { + meta_tp->num_segments_flow_one += meta_tp->ratio_search_step/2; + } + last_rate = 0; + in_search = false; + meta_tp->ratio_rate_sample = sysctl_mptcp_rate_sample; + goto reset; + break; + } + }/*End ratio searching*/ + + last_rate = meta_tp->rate_delivered;//if we are not in_search, last_rate is what we collected this interval +reset: + /*Save the calculated parameters this interval*/ + meta_tp->prr_out = last_rate;//rate delivered this interval + meta_tp->prr_delivered = trigger_threshold; + meta_tp->lost = in_search; + meta_tp->snd_ssthresh = threshold_cnt; + meta_tp->buffer_threshold_cnt = buffer_threshold_cnt; + meta_tp->prior_ssthresh = last_trigger_tstamp; + meta_tp->total_retrans = count_set_init_rate; + meta_tp->prior_cwnd = init_rate; + memcpy(meta_tp->init_buffer_size, init_buffer_size, 2*sizeof(u32)); + memcpy(meta_tp->last_buffer_size, last_buffer_size, 2*sizeof(u32)); + + /*Reset the containers for the next intervals*/ + meta_tp->delivered = 0; + meta_tp->rate_delivered = 0; + meta_tp->high_seq = 0; + meta_tp->undo_marker = 0; + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it_temp = tcp_sk(sk_it); + tp_it_temp->rate_est_val = 0; + tp_it_temp->rate_est_cnt = 0; + } + meta_tp->rate_interval_us = jiffies; + + }/*end dynamic ratio search*/ + // AUTO-RATE search + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + mptcp_sched_probe_init(&sprobe); + sched_probe_id = ULONG_MAX; + if (choose_sk == sk_it) { + sprobe.split = split; + sprobe.skblen = DIV_ROUND_UP(skb->len, mss_now); + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk_it); + break; + } + } +#endif + return skb; + }/*Schedule the chosen socket*/ +#ifdef MPTCP_SCHED_PROBE + iter = 0; + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + mptcp_sched_probe_init(&sprobe);//initialize sprobe + iter++; + + if (!mptcp_ratio_is_available(sk_it, skb, false, cwnd_limited)) sprobe.temp_unavailable = true; + + //Probe the chosen subflow + if (choose_sk == sk_it) { + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk_it); + } + //Don't probe the current subflow + else mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk_it); + } +#endif + return NULL; +}/*End scheduling next segment*/ + +static struct mptcp_sched_ops mptcp_sched_ratio = { + .get_subflow = ratio_get_available_subflow, + .next_segment = mptcp_ratio_next_segment, + .name = "ratio", + .owner = THIS_MODULE, +}; + +static int __init ratio_register(void) +{ + BUILD_BUG_ON(sizeof(struct ratio_sched_priv) > MPTCP_SCHED_SIZE); + + if (mptcp_register_scheduler(&mptcp_sched_ratio)) + return -1; + + num_segments_flow_one = sysctl_num_segments_flow_one; + printk("ratio scheduler init. with params: num_segments: %u, cwnd_limited: %u\n", num_segments, cwnd_limited); + + return 0; +} + + +static void ratio_unregister(void) +{ + mptcp_unregister_scheduler(&mptcp_sched_ratio); +} + +module_init(ratio_register); +module_exit(ratio_unregister); + +MODULE_AUTHOR("Swetank Kumar Saha"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RATIO MPTCP"); +MODULE_VERSION("0.02"); diff --git a/net/mptcp/mptcp_rtt.c b/net/mptcp/mptcp_rtt.c new file mode 100644 index 0000000000000..fdd79b26c92a6 --- /dev/null +++ b/net/mptcp/mptcp_rtt.c @@ -0,0 +1,619 @@ +/* MPTCP Scheduler module selector. 
Highly inspired by tcp_cong.c */ + +#include <linux/module.h> +#include <net/mptcp.h> + + +/* Mirrored from net/ipv4/tcp_output.c */ +/* Does at least the first segment of SKB fit into the send window? */ +//bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, +// unsigned int cur_mss) +//{ +// u32 end_seq = TCP_SKB_CB(skb)->end_seq; +// +// if (skb->len > cur_mss) +// end_seq = TCP_SKB_CB(skb)->seq + cur_mss; +// +// return !after(end_seq, tcp_wnd_end(tp)); +//} + +/* Mirrored from net/ipv4/tcp_output.c */ +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +//unsigned int tcp_cwnd_test(const struct tcp_sock *tp, +// const struct sk_buff *skb) +//{ +// u32 in_flight, cwnd, halfcwnd; +// +// /* Don't be strict about the congestion window for the final FIN. */ +// if (skb && +// (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && +// tcp_skb_pcount(skb) == 1) +// return 1; +// +// in_flight = tcp_packets_in_flight(tp); +// cwnd = tp->snd_cwnd; +// if (in_flight >= cwnd) +// return 0; +// +// /* For better scheduling, ensure we have at least +// * 2 GSO packets in flight. +// */ +// halfcwnd = max(cwnd >> 1, 1U); +// return min(halfcwnd, cwnd - in_flight); +//} + +struct defsched_priv { + u32 last_rbuf_opti; +}; + +static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp) +{ + return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +static bool mptcp_is_temp_unavailable(struct sock *sk, + const struct sk_buff *skb, + bool zero_wnd_test) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int mss_now, space, in_flight; + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { + /* If SACK is disabled, and we got a loss, TCP does not exit + * the loss-state until something above high_seq has been + * acked. (see tcp_try_undo_recovery) + * + * high_seq is the snd_nxt at the moment of the RTO. As soon + * as we have an RTO, we won't push data on the subflow. + * Thus, snd_una can never go beyond high_seq. + */ + if (!tcp_is_reno(tp)) { + //printk("shivanga: not is reno\n"); + return true; + } + else if (tp->snd_una != tp->high_seq) { + //printk("shivanga: una != high_seq\n"); + return true; + } + } + + if (!tp->mptcp->fully_established) { + /* Make sure that we send in-order data */ + if (skb && tp->mptcp->second_packet && + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) { + //printk("shivanga: not fully established\n"); + return true; + } + } + + in_flight = tcp_packets_in_flight(tp); + /* Not even a single spot in the cwnd */ + if (in_flight >= tp->snd_cwnd) { + //printk("no spot in cwnd, in_flight: %d, snd_cwnd: %d\n", in_flight, tp->snd_cwnd); + return true; + } + + /* Now, check if what is queued in the subflow's send-queue + * already fills the cwnd. + */ + space = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + if (tp->write_seq - tp->snd_nxt > space) { + //printk("shivanga: no space 1\n"); + return true; + } + + if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) { + //printk("shivanga: no space 2\n"); + return true; + } + + mss_now = tcp_current_mss(sk); + + /* Don't send on this subflow if we bypass the allowed send-window at + * the per-subflow level. Similar to tcp_snd_wnd_test, but manually + * calculated end_seq (because here at this point end_seq is still at + * the meta-level).
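+ * Only the first MSS of the skb is counted here (min(skb->len, mss_now)), + * as a larger skb would be split before transmission anyway.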
+ */ + if (skb && !zero_wnd_test && + after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) { + //printk("shivanga: passed allowed snd wnd\n"); + return true; + } + + return false; +} + +/* Are we not allowed to reinject this skb on tp? */ +static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) +{ + /* If the skb has already been enqueued in this sk, try to find + * another one. + */ + return skb && + /* Has the skb already been enqueued into this subsocket? */ + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; +} + +/* Generic function to iterate over used and unused subflows and to select the + * best one + */ +#ifdef MPTCP_SCHED_PROBE +static struct sock +*rtt_get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb, + bool (*selector)(const struct tcp_sock *), + bool zero_wnd_test, bool *force, unsigned long sched_probe_id) +#else +static struct sock +*rtt_get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb, + bool (*selector)(const struct tcp_sock *), + bool zero_wnd_test, bool *force) +#endif +{ + struct sock *bestsk = NULL; + u32 min_srtt = 0xffffffff; + bool found_unused = false; + bool found_unused_una = false; + //struct sock *sk; + struct mptcp_tcp_sock *mptcp; +#ifdef MPTCP_SCHED_PROBE + struct mptcp_sched_probe sprobe; +#endif + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(mptcp_to_sock(mptcp)); + bool unused = false; +#ifdef MPTCP_SCHED_PROBE + mptcp_sched_probe_init(&sprobe); +#endif + + /* First, we choose only the wanted sks */ + if (!(*selector)(tp)) { +#ifdef MPTCP_SCHED_PROBE + sprobe.selector_reject = true; + mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk); +#endif + continue; + } + + if (!mptcp_dont_reinject_skb(tp, skb)) + unused = true; + else if (found_unused) { +#ifdef MPTCP_SCHED_PROBE + sprobe.found_unused_reject = true; + mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk); +#endif + /* If an unused sk was found previously, we continue - + * no need to check used sks anymore. + */ + continue; + } + + if (mptcp_is_def_unavailable(sk)) { +#ifdef MPTCP_SCHED_PROBE + sprobe.def_unavailable = true; + mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk); +#endif + continue; + } + + if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) { + if (unused) + found_unused_una = true; +#ifdef MPTCP_SCHED_PROBE + sprobe.temp_unavailable = true; + mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk); +#endif + continue; + } + + if (unused) { + if (!found_unused) { + /* It's the first time we encounter an unused + * sk - thus we reset the bestsk (which might + * have been set to a used sk). + */ + min_srtt = 0xffffffff; + bestsk = NULL; + } + found_unused = true; + } + + if (tp->srtt_us < min_srtt) { + min_srtt = tp->srtt_us; + bestsk = sk; + } +#ifdef MPTCP_SCHED_PROBE + else { + sprobe.srtt_reject = true; + mptcp_sched_probe_log_hook(&sprobe, false, sched_probe_id, sk); + } +#endif + } + + if (bestsk) { + /* The force variable is used to mark the returned sk as + * previously used or not-used. + */ + if (found_unused) + *force = true; + else + *force = false; + } else { + /* The force variable is used to mark if there are temporarily + * unavailable not-used sks.
+ */ + if (found_unused_una) + *force = true; + else + *force = false; + } + +#ifdef MPTCP_SCHED_PROBE + mptcp_sched_probe_init(&sprobe); + if(bestsk) { + //sprobe.skblen = DIV_ROUND_UP(skb->len, tcp_current_mss(bestsk)); + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, bestsk); + } +#endif + return bestsk; +} + +/* This is the scheduler. This function decides on which flow to send + * a given MSS. If all subflows are found to be busy, NULL is returned. + * The flow is selected based on the shortest RTT. + * If all paths have full cong windows, we simply return NULL. + * + * Additionally, this function is aware of the backup-subflows. + */ +struct sock *rtt_get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, + bool zero_wnd_test) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + /*Phuc*/ + struct sock *sk=NULL; + struct mptcp_tcp_sock *mptcp; + /****/ + bool force; +#ifdef MPTCP_SCHED_PROBE + struct mptcp_sched_probe sprobe; + unsigned long sched_probe_id; + + mptcp_sched_probe_init(&sprobe); + get_random_bytes(&sched_probe_id, sizeof(sched_probe_id)); +#endif + + /* if there is only one subflow, bypass the scheduling function */ + if (mpcb->cnt_subflows == 1) { + sk = (struct sock *)mpcb->connection_list; + if (!mptcp_is_available(sk, skb, zero_wnd_test)) + sk = NULL; +#ifdef MPTCP_SCHED_PROBE + if(sk) { + //sprobe.skblen = DIV_ROUND_UP(skb->len, tcp_current_mss(sk)); + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk); + } +#endif + return sk; + } + + /* Answer data_fin on same subflow!!! */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_is_available(sk, skb, zero_wnd_test)) { +#ifdef MPTCP_SCHED_PROBE + if(sk) { + //sprobe.skblen = DIV_ROUND_UP(skb->len, tcp_current_mss(sk)); + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, sk); + } +#endif + return sk; + } + } + } + +#ifdef MPTCP_SCHED_PROBE + /* Find the best subflow */ + sk = rtt_get_subflow_from_selectors(mpcb, skb, &subflow_is_active, + zero_wnd_test, &force, sched_probe_id); + if (force) + /* one unused active sk or one NULL sk when there is at least + * one temporarily unavailable unused active sk + */ + return sk; + + sk = rtt_get_subflow_from_selectors(mpcb, skb, &subflow_is_backup, + zero_wnd_test, &force, sched_probe_id); +#else + /* Find the best subflow */ + sk = rtt_get_subflow_from_selectors(mpcb, skb, &subflow_is_active, + zero_wnd_test, &force); + if (force) + /* one unused active sk or one NULL sk when there is at least + * one temporarily unavailable unused active sk + */ + return sk; + + sk = rtt_get_subflow_from_selectors(mpcb, skb, &subflow_is_backup, + zero_wnd_test, &force); +#endif + if (!force && skb) + /* one used backup sk or one NULL sk where there is no + * temporarily unavailable unused backup sk + * + * the skb passed through all the available active and backup + * sks, so clean the path mask + */ + TCP_SKB_CB(skb)->path_mask = 0; + return sk; +} + +static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) +{ + struct sock *meta_sk; + const struct tcp_sock *tp = tcp_sk(sk); + //struct tcp_sock *tp_it; + struct mptcp_tcp_sock *mptcp; + struct sk_buff *skb_head; + struct defsched_priv *dsp = defsched_get_priv(tp); + + if (tp->mpcb->cnt_subflows == 1) + return NULL; + + if (sysctl_mptcp_scheduler_optimizations_disabled > 2) return NULL; + + meta_sk = 
mptcp_meta_sk(sk); + skb_head = tcp_write_queue_head(meta_sk); + + if (!skb_head || skb_head == tcp_send_head(meta_sk)) + return NULL; + + /* If penalization is optional (coming from mptcp_next_segment() and + * We are not send-buffer-limited we do not penalize. The retransmission + * is just an optimization to fix the idle-time due to the delay before + * we wake up the application. + */ + if (!penal && sk_stream_memory_free(meta_sk)) + goto retrans; + + /* Only penalize again after an RTT has elapsed */ + if (tcp_jiffies32 - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3)) + goto retrans; + + if (sysctl_mptcp_scheduler_optimizations_disabled > 1) + goto retrans; + + /* Half the cwnd of the slow flow */ + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct tcp_sock *tp_it = mptcp->tp; + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { + u32 prior_cwnd = tp_it->snd_cwnd; + + if (sysctl_mptcp_scheduler_optimizations_disabled && tcp_in_slow_start(tp_it)) + continue; + + tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); + + /* If in slow start, do not reduce the ssthresh */ + if (prior_cwnd >= tp_it->snd_ssthresh) + tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); + + dsp->last_rbuf_opti = tcp_jiffies32; + } + break; + } + } + +retrans: + + /* Segment not yet injected into this path? Take it!!! */ + if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { + bool do_retrans = false; + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct tcp_sock *tp_it = mptcp->tp; + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp_it->snd_cwnd <= 4) { + do_retrans = true; + break; + } + + if (4 * tp->srtt_us >= tp_it->srtt_us) { + do_retrans = false; + break; + } else { + do_retrans = true; + } + } + } + + if (do_retrans && mptcp_is_available(sk, skb_head, false)) + return skb_head; + } + return NULL; +} + +/* Returns the next segment to be sent from the mptcp meta-queue. + * (chooses the reinject queue if any segment is waiting in it, otherwise, + * chooses the normal write queue). + * Sets *@reinject to 1 if the returned segment comes from the + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, + * and sets it to -1 if it is a meta-level retransmission to optimize the + * receive-buffer. 
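+ * (the -1 case originates from mptcp_rcv_buf_optimization() above).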
+ */ +static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sk_buff *skb = NULL; + + *reinject = 0; + + /* If we are in fallback-mode, just take from the meta-send-queue */ + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) + return tcp_send_head(meta_sk); + + skb = skb_peek(&mpcb->reinject_queue); + + if (skb) { + *reinject = 1; + } else { + skb = tcp_send_head(meta_sk); + + if (!skb && meta_sk->sk_socket && + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) && + sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { + struct sock *subsk = rtt_get_available_subflow(meta_sk, NULL, + false); + if (!subsk) + return NULL; + + skb = mptcp_rcv_buf_optimization(subsk, 0); + if (skb) + *reinject = -1; + } + } + return skb; +} + +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + +static struct sk_buff *mptcp_rtt_next_segment(struct sock *meta_sk, + int *reinject, + struct sock **subsk, + unsigned int *limit) +{ + struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject); + unsigned int mss_now; + struct tcp_sock *subtp; + u16 gso_max_segs; + u32 max_len, max_segs, window, needed; + //shivanga + /*struct inet_sock *inet; + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } dst;*/ +#ifdef MPTCP_SCHED_PROBE + struct mptcp_sched_probe sprobe; + unsigned long sched_probe_id = ULONG_MAX; + + mptcp_sched_probe_init(&sprobe); + //get_random_bytes(&sched_probe_id, sizeof(sched_probe_id)); +#endif + //if (inet==NULL) printk("shivanga: inet null\n"); + /* As we set it, we have to reset it as well. */ + *limit = 0; + + if (!skb) + return NULL; + + *subsk = rtt_get_available_subflow(meta_sk, skb, false); + if (!*subsk) + return NULL; + + //inet = inet_sk(*subsk); + subtp = tcp_sk(*subsk); + mss_now = tcp_current_mss(*subsk); + + if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) { + skb = mptcp_rcv_buf_optimization(*subsk, 1); + if (skb) + *reinject = -1; + else + return NULL; + } + + /* No splitting required, as we will only send one single segment */ + if (skb->len <= mss_now) + return skb; + + /* The following is similar to tcp_mss_split_point, but + * we do not care about nagle, because we will anyways + * use TCP_NAGLE_PUSH, which overrides this. + * + * So, we first limit according to the cwnd/gso-size and then according + * to the subflow's window. 
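+ * The result is passed back through *limit, telling the caller how much + * of this skb may be sent on the chosen subflow in one go.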
+ */ + + gso_max_segs = (*subsk)->sk_gso_max_segs; + if (!gso_max_segs) /* No gso supported on the subflow's NIC */ + gso_max_segs = 1; + max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs); + if (!max_segs) + return NULL; + + max_len = mss_now * max_segs; + window = tcp_wnd_end(subtp) - subtp->write_seq; + + needed = min(skb->len, window); + if (max_len <= skb->len) + /* Take max_win, which is actually the cwnd/gso-size */ + *limit = max_len; + else + /* Or, take the window */ + *limit = needed; + //tcp_probe_copy_fl_to_si4(inet, dst.v4, d); + //printk("shivanga: %pISpc\n",&dst); + //printk("%d %d %d\n",DIV_ROUND_UP(*limit, mss_now), DIV_ROUND_UP(skb->len, mss_now), DIV_ROUND_UP(window,mss_now)); + +#ifdef MPTCP_SCHED_PROBE + if (*subsk) { + sprobe.split = DIV_ROUND_UP(*limit, mss_now); + sprobe.skblen = DIV_ROUND_UP(skb->len, mss_now); + mptcp_sched_probe_log_hook(&sprobe, true, sched_probe_id, *subsk); + } +#endif + return skb; +} + +static void rtt_init(struct sock *sk) +{ + struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk)); + + dsp->last_rbuf_opti = tcp_jiffies32; +} + +static struct mptcp_sched_ops mptcp_sched_rtt = { + .get_subflow = rtt_get_available_subflow, + .next_segment = mptcp_rtt_next_segment, + .init = rtt_init, + .name = "rtt", + .owner = THIS_MODULE, +}; + +static int __init rtt_register(void) +{ + BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE); + + if (mptcp_register_scheduler(&mptcp_sched_rtt)) + return -1; + + return 0; +} + +static void rtt_unregister(void) +{ + mptcp_unregister_scheduler(&mptcp_sched_rtt); +} + +module_init(rtt_register); +module_exit(rtt_unregister); + +MODULE_AUTHOR("Swetank Kumar Saha"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RTT MPTCP"); +MODULE_VERSION("0.01"); diff --git a/net/mptcp/mptcp_sched_probe.c b/net/mptcp/mptcp_sched_probe.c new file mode 100644 index 0000000000000..b4d685bd9bda3 --- /dev/null +++ b/net/mptcp/mptcp_sched_probe.c @@ -0,0 +1,347 @@ +/* + * mptcp_sched_probe - Observe the MPTCP scheduler with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Swetank Kumar Saha "); +MODULE_DESCRIPTION("MPTCP scheduler snooper"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); + +static int port __read_mostly; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static unsigned int bufsize __read_mostly = 4096; +MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); +module_param(bufsize, uint, 0); + +static const char procname[] = "mptcp_sched_probe"; + +struct tcp_log { + unsigned long id; + ktime_t tstamp; + union { + struct sockaddr raw; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } src, dst; + u16 length; + u32 snd_nxt; + u32 snd_una; + u32 snd_wnd; + u32 rcv_wnd; + u32 snd_cwnd; + u32 ssthresh; + u32 srtt; + + u32 rttvar_us; + u32 rto; + u32 mss_cache; + u32 packets_in_flight; + u32 retrans_out; + u32 total_retrans; + u32 rack_rtt_us; + u8 rack_reord; + u32 snd_cwnd_clamp; + u32 snd_cwnd_used; + u32 lost_out; + bool is_cwnd_limited; + u32 rate_delivered; + u32 rate_interval_us; + + bool selector_reject; + bool found_unused_reject; + bool def_unavailable; + bool temp_unavailable; + bool srtt_reject; + bool selected; + int split; + int skblen; + u32 tx_bytes; + u32 trans_start; +}; + +static struct { + spinlock_t lock; + wait_queue_head_t wait; + ktime_t start; + + unsigned long head, tail; + struct tcp_log *log; +} tcp_probe; + +static inline int tcp_probe_used(void) +{ + return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); +} + +static inline int tcp_probe_avail(void) +{ + return bufsize - tcp_probe_used() - 1; +} + +#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ + do { \ + si4.sin_family = AF_INET; \ + si4.sin_port = inet->inet_##mem##port; \ + si4.sin_addr.s_addr = inet->inet_##mem##addr; \ + } while (0) \ + +static void log_tcp_params(struct mptcp_sched_probe* sprobe) +{ + struct sock* sk = sprobe->sk; + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + if(!sk) return; + + /* Only update if port or skb mark matches */ + if (((port == 0) || + ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port )) { + spin_lock(&tcp_probe.lock); + /* If log fills, just silently drop*/ + if (tcp_probe_avail() > 1){ + struct tcp_log *p = tcp_probe.log + tcp_probe.head; + + p->tstamp = ktime_get(); + switch (sk->sk_family) { + case AF_INET: + tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); + tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); + break; + case AF_INET6: + memset(&p->src.v6, 0, sizeof(p->src.v6)); + memset(&p->dst.v6, 0, sizeof(p->dst.v6)); +#if IS_ENABLED(CONFIG_IPV6) + p->src.v6.sin6_family = AF_INET6; + p->src.v6.sin6_port = inet->inet_sport; + p->src.v6.sin6_addr = inet6_sk(sk)->saddr; + + p->dst.v6.sin6_family = AF_INET6; + p->dst.v6.sin6_port = inet->inet_dport; + p->dst.v6.sin6_addr = sk->sk_v6_daddr; +#endif + break; + default: + BUG(); + } + + p->length = 0;//p->length = skb->len; + p->snd_nxt = tp->snd_nxt; + p->snd_una = tp->snd_una; + p->snd_cwnd = tp->snd_cwnd; + p->snd_wnd = tp->snd_wnd; + p->rcv_wnd = tp->rcv_wnd; + p->ssthresh = tcp_current_ssthresh(sk); + p->srtt = tp->srtt_us >> 3; + + p->rttvar_us = tp->rttvar_us; + p->rto = __tcp_set_rto(tp);//see /include/net/tcp.h + p->mss_cache = tp->mss_cache; + p->packets_in_flight = tcp_packets_in_flight(tp);//see /include/net/tcp.h + p->retrans_out = tp->retrans_out; + 
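/* loss-recovery counters, RACK and cwnd state, rate samples, then the scheduler-probe verdict copied from sprobe */ +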
p->total_retrans = tp->total_retrans; + p->rack_rtt_us = (tp->rack).rtt_us; + p->rack_reord = (tp->rack).reord; + p->snd_cwnd_clamp = tp->snd_cwnd_clamp; + p->snd_cwnd_used = tp->snd_cwnd_used; + p->lost_out = tp->lost_out; + p->is_cwnd_limited = tp->is_cwnd_limited; + p->rate_delivered = tp->rate_delivered; + p->rate_interval_us = tp->rate_interval_us; + p->id = sprobe->id; + p->selector_reject = sprobe->selector_reject; + p->found_unused_reject = sprobe->found_unused_reject; + p->def_unavailable = sprobe->def_unavailable; + p->temp_unavailable = sprobe->temp_unavailable; + p->srtt_reject = sprobe->srtt_reject; + p->selected = sprobe->selected; + p->split = sprobe->split; + p->skblen = sprobe->skblen; + p->tx_bytes = sprobe->tx_bytes; + p->trans_start = sprobe->trans_start; + + tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); + } + + spin_unlock(&tcp_probe.lock); + + wake_up(&tcp_probe.wait); + } +} + +static int kmptcp_sched_probe_log_hook(struct kretprobe_instance *ri, struct pt_regs *regs) { + struct mptcp_sched_probe *sprobe; + + sprobe = (struct mptcp_sched_probe*) regs_return_value(regs); + log_tcp_params(sprobe); + return 0; +} + +static struct kretprobe mptcp_kprobe = { + .kp = { + .symbol_name = "mptcp_sched_probe_log_hook", + }, + .handler = kmptcp_sched_probe_log_hook, +}; + +static int tcpprobe_open(struct inode *inode, struct file *file) +{ + /* Reset (empty) log */ + spin_lock_bh(&tcp_probe.lock); + tcp_probe.head = tcp_probe.tail = 0; + tcp_probe.start = ktime_get(); + spin_unlock_bh(&tcp_probe.lock); + + return 0; +} + +static int tcpprobe_sprint(char *tbuf, int n) +{ + const struct tcp_log *p + = tcp_probe.log + tcp_probe.tail; + struct timespec tv + = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); + + return scnprintf(tbuf, n, + "%lu %lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u %u %u %u %u %u %u %u %x %u %u %u %x %x %x %x %x %x %x %d %d %u %u %u %u\n", + p->id, + (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_nsec, + &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, + p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd, + p->rttvar_us, p->rto, p->mss_cache, p->packets_in_flight, + p->retrans_out, p->total_retrans, p->rack_rtt_us, p->rack_reord, + p->snd_cwnd_clamp, p->snd_cwnd_used, p->lost_out, p->is_cwnd_limited, + p->selector_reject, p->found_unused_reject, p->def_unavailable, p->temp_unavailable, p->srtt_reject, + p->selected, p->split, p->skblen, p->tx_bytes, p->trans_start, p->rate_delivered, p->rate_interval_us); +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0; + size_t cnt = 0; + + if (!buf) + return -EINVAL; + + while (cnt < len) { + char tbuf[256]; + int width; + + /* Wait for data in buffer */ + error = wait_event_interruptible(tcp_probe.wait, + tcp_probe_used() > 0); + if (error) + break; + + spin_lock_bh(&tcp_probe.lock); + if (tcp_probe.head == tcp_probe.tail) { + /* multiple readers race? */ + spin_unlock_bh(&tcp_probe.lock); + continue; + } + + width = tcpprobe_sprint(tbuf, sizeof(tbuf)); + + if (cnt + width < len) + tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); + + spin_unlock_bh(&tcp_probe.lock); + + /* if record greater than space available + return partial buffer (so far) */ + if (cnt + width >= len) + break; + + if (copy_to_user(buf + cnt, tbuf, width)) + return -EFAULT; + cnt += width; + } + + return cnt == 0 ? 
error : cnt; +} + +static const struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, + .llseek = noop_llseek, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&tcp_probe.wait); + spin_lock_init(&tcp_probe.lock); + + if (bufsize == 0) + return -EINVAL; + + bufsize = roundup_pow_of_two(bufsize); + tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); + if (!tcp_probe.log) + goto err0; + + if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) + goto err0; + + ret = register_kretprobe(&mptcp_kprobe); + + if (ret) + goto err1; + + pr_info("mptcp_sched_probe registered (port=%d) bufsize=%u\n", + port, bufsize); + return 0; + err1: + remove_proc_entry(procname, init_net.proc_net); + err0: + kfree(tcp_probe.log); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + remove_proc_entry(procname, init_net.proc_net); + unregister_kretprobe(&mptcp_kprobe); + kfree(tcp_probe.log); +} +module_exit(tcpprobe_exit); diff --git a/net/mptcp/mptcp_sched_probe_log_hook.c b/net/mptcp/mptcp_sched_probe_log_hook.c new file mode 100644 index 0000000000000..0309f688f229c --- /dev/null +++ b/net/mptcp/mptcp_sched_probe_log_hook.c @@ -0,0 +1,28 @@ +#include <net/mptcp.h> + +void mptcp_sched_probe_init(struct mptcp_sched_probe *sprobe) +{ + sprobe->id = 0; + sprobe->sk = NULL; + sprobe->selector_reject = false; + sprobe->found_unused_reject = false; + sprobe->def_unavailable = false; + sprobe->temp_unavailable = false; + sprobe->srtt_reject = false; + sprobe->selected = false; + sprobe->split = 0; + sprobe->skblen = 0; + sprobe->tx_bytes = 0; + sprobe->trans_start = 0; +} +EXPORT_SYMBOL_GPL(mptcp_sched_probe_init); + +/* This exists only for kretprobe to hook on to and read sprobe */ +noinline struct mptcp_sched_probe* mptcp_sched_probe_log_hook(struct mptcp_sched_probe* sprobe, bool selected, unsigned long sched_probe_id, struct sock *sk) { + sprobe->selected = selected; + sprobe->id = sched_probe_id; + sprobe->sk = sk; + + return sprobe; +} +EXPORT_SYMBOL_GPL(mptcp_sched_probe_log_hook);
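Usage sketch (illustrative, not part of the patch): the probe module above registers /proc/net/mptcp_sched_probe and emits one whitespace-separated text record per logged scheduler verdict, in the field order printed by tcpprobe_sprint(). A minimal userspace reader, assuming only that proc file name, could look like this; the output handling is a placeholder.

#include <stdio.h>

int main(void)
{
	FILE *in = fopen("/proc/net/mptcp_sched_probe", "r");
	char line[512];	/* each record is formatted into a 256-byte kernel buffer */

	if (!in) {
		perror("fopen /proc/net/mptcp_sched_probe");
		return 1;
	}
	/* tcpprobe_read() blocks until the kretprobe hook logs a sample,
	 * so this loop simply follows the log while the module is loaded.
	 */
	while (fgets(line, sizeof(line), in))
		fputs(line, stdout);	/* or split the fields and post-process */

	fclose(in);
	return 0;
}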