-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathreproduce_a_leaked_dst_entry
More file actions
174 lines (145 loc) · 5.18 KB
/
reproduce_a_leaked_dst_entry
File metadata and controls
174 lines (145 loc) · 5.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
https://github.com/moby/moby/issues/5618
https://access.redhat.com/solutions/3659011
We've reproduced the same bug using a diagnostic kernel that had delays artificially inserted to make PMTU discovery exception routes hit this window.
1, Debugging kernel patche:
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a0163c5..6b9e7ee 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,6 +133,8 @@
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
+static int ref_leak_test;
+
/*
* Interface to generic destination cache.
*/
@@ -1599,6 +1601,9 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
while (fnhe) {
if (fnhe->fnhe_daddr == daddr) {
+ if (ref_leak_test)
+ pr_info("XXX pid: %d, %s: fib_nh:%p, fnhe:%p, daddr:%x\n",
+ current->pid, __func__, nh, fnhe, daddr);
rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
fnhe_flush_routes(fnhe);
@@ -2145,10 +2150,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = find_exception(nh, fl4->daddr);
if (fnhe) {
+ if (ref_leak_test)
+ pr_info("XXX pid: %d, found fnhe :%p\n", current->pid, fnhe);
prth = &fnhe->fnhe_rth_output;
rth = rcu_dereference(*prth);
if (rth && rth->dst.expires &&
time_after(jiffies, rth->dst.expires)) {
+ if (ref_leak_test)
+ pr_info("eXX pid: %d, del fnhe :%p\n", current->pid, fnhe);
ip_del_fnhe(nh, fl4->daddr);
fnhe = NULL;
} else {
@@ -2204,6 +2213,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
#endif
}
+ if (fnhe && ref_leak_test) {
+ unsigned long time_out;
+
+ time_out = jiffies + ref_leak_test;
+ while (time_before(jiffies, time_out))
+ cpu_relax();
+ pr_info("XXX pid: %d, reuse fnhe :%p\n", current->pid, fnhe);
+ }
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
if (lwtunnel_output_redirect(rth->dst.lwtstate))
rth->dst.output = lwtunnel_output;
@@ -2733,6 +2750,13 @@ static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
.proc_handler = proc_dointvec,
},
{
+ .procname = "ref_leak_test",
+ .data = &ref_leak_test,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "max_size",
.data = &ip_rt_max_size,
.maxlen = sizeof(int),
2, User mode script:
# cat ref_leak_test_begin.sh
#!/bin/bash
# constructing a basic network with netns
# client <-->gateway <--> server
ip netns add svr
ip netns add gw
ip netns add cli
ip netns exec gw sysctl net.ipv4.ip_forward=1
ip link add svr-veth type veth peer name svrgw-veth
ip link add cli-veth type veth peer name cligw-veth
ip link set svr-veth netns svr
ip link set svrgw-veth netns gw
ip link set cligw-veth netns gw
ip link set cli-veth netns cli
ip netns exec svr ifconfig svr-veth 192.168.123.1
ip netns exec gw ifconfig svrgw-veth 192.168.123.254
ip netns exec gw ifconfig cligw-veth 10.0.123.254
ip netns exec cli ifconfig cli-veth 10.0.123.1
ip netns exec cli route add default gw 10.0.123.254
ip netns exec svr route add default gw 192.168.123.254
# constructing concurrently accessed scenes with nerperf
nohup ip netns exec svr netserver -L 192.168.123.1
nohup ip netns exec cli netperf -H 192.168.123.1 -l 300 &
nohup ip netns exec cli netperf -H 192.168.123.1 -l 300 &
nohup ip netns exec cli netperf -H 192.168.123.1 -l 300 &
nohup ip netns exec cli netperf -H 192.168.123.1 -l 300 &
# Add delay
echo 3000 > /proc/sys/net/ipv4/route/ref_leak_test
# making PMTU discovery exception routes
echo 1 > /proc/sys/net/ipv4/route/mtu_expires
for((i=1;i<=60;i++));
do
for j in 1400 1300 1100 1000
do
echo "set mtu to "$j;
ip netns exec svr ifconfig svr-veth mtu $j;
ip netns exec cli ifconfig cli-veth mtu $j;
ip netns exec gw ifconfig svrgw-veth mtu $j;
ip netns exec gw ifconfig cligw-veth mtu $j;
sleep 2;
done
done
# cat ref_leak_test_end.sh
#!/bin/bash
echo 0 > /proc/sys/net/ipv4/route/ref_leak_test
pkill netserver
pkill netperf
ip netns exec cli ifconfig cli-veth down
ip netns exec gw ifconfig svrgw-veth down
ip netns exec gw ifconfig cligw-veth down
ip netns exec svr ifconfig svr-veth down
ip netns del svr
ip netns del gw
ip netns del cli
3, The test process:
first load the debug kernel, then run ref_leak_test_begin.sh, wait a few seconds, run ref_leak_test_end.sh, and finally you can observe the error.
[root@iZuf6h1kfgutxc3el68z2lZ test]# bash ref_leak_test_begin.sh
net.ipv4.ip_forward = 1
nohup: ignoring input and appending output to ‘nohup.out’
nohup: set mtu to 1400
appending output to ‘nohup.out’
nohup: appending output to ‘nohup.out’
nohup: appending output to ‘nohup.out’
nohup: appending output to ‘nohup.out’
set mtu to 1300
set mtu to 1100
set mtu to 1000
set mtu to 1400
set mtu to 1300
set mtu to 1100
^C
[root@iZuf6h1kfgutxc3el68z2lZ test]# bash ref_leak_test_end.sh
[root@iZuf6h1kfgutxc3el68z2lZ test]#
Message from syslogd@iZuf6h1kfgutxc3el68z2lZ at Nov 4 20:29:43 ...
kernel:unregister_netdevice: waiting for cli-veth to become free. Usage count = 1
--
After some testing, torvalds/linux@ee60ad2 can indeed fix this bug.