From 6777ac7052a0b5f3347900e60a1f0bfa0a401955 Mon Sep 17 00:00:00 2001 From: Tomas Hruby Date: Thu, 14 Mar 2024 16:10:17 -0700 Subject: [PATCH] [BPF] When lo has an IP, override it when CTLB is disabled When CTLB is disabled, we route traffic for services via the bpfnatin/out device. Since the final destination isn't resolved yet, Linux picks up an address set on the loopback device (if there is any) as the source. This may not be (and likely is not) an address that the destination can use to return traffic. Therefore we need to replace it with the host's IP, which is routable within the cluster. We use the same mechanism as for replacing the main host device IP with a tunnel IP when we need to reach a remote workload via an overlay. --- felix/bpf-gpl/tc.c | 42 +++++++++++++++++++++++++++++------------ felix/fv/bpf_test.go | 45 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/felix/bpf-gpl/tc.c b/felix/bpf-gpl/tc.c index 750be3abbc5..baa3a17e08b 100644 --- a/felix/bpf-gpl/tc.c +++ b/felix/bpf-gpl/tc.c @@ -445,7 +445,12 @@ static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx) } } - if (CALI_F_TO_WEP && !skb_seen(ctx->skb) && + if (CALI_F_TO_WEP && + /* We have not seen the packet yet, must originate from the host. */ + (!skb_seen(ctx->skb) || + /* We have seen the packet, but was looped via NAT iface and so it must be from the host. */ + skb_mark_equals(ctx->skb, CALI_SKB_MARK_FROM_NAT_IFACE_OUT, CALI_SKB_MARK_FROM_NAT_IFACE_OUT)) && + /* Double check that it has host source IP - do it last, it is the most expensive test. */ cali_rt_flags_local_host(cali_rt_lookup_flags(&ctx->state->ip_src))) { /* Host to workload traffic always allowed. We discount traffic that was * seen by another program since it must have come in via another interface. 
@@ -1329,17 +1334,30 @@ int calico_tc_skb_new_flow_entrypoint(struct __sk_buff *skb) if ((CALI_F_TO_HOST && CALI_F_NAT_IF) || (CALI_F_TO_HEP && (CALI_F_LO || CALI_F_MAIN))) { struct cali_rt *r = cali_rt_lookup(&state->post_nat_ip_dst); - if (r && cali_rt_flags_remote_workload(r->flags) && cali_rt_is_tunneled(r)) { - CALI_DEBUG("remote wl %x tunneled via %x\n", - debug_ip(state->post_nat_ip_dst), debug_ip(HOST_TUNNEL_IP)); - ct_ctx_nat->src = HOST_TUNNEL_IP; - /* This would be the place to set a new source port if we - * had a way how to allocate it. Instead we rely on source - * port collision resolution. - * ct_ctx_nat->sport = 10101; - */ - state->ct_result.nat_sip = ct_ctx_nat->src; - state->ct_result.nat_sport = ct_ctx_nat->sport; + if (r) { + if (cali_rt_flags_remote_workload(r->flags) && cali_rt_is_tunneled(r)) { + CALI_DEBUG("remote wl %x tunneled via %x\n", + debug_ip(state->post_nat_ip_dst), debug_ip(HOST_TUNNEL_IP)); + ct_ctx_nat->src = HOST_TUNNEL_IP; + /* This would be the place to set a new source port if we + * had a way how to allocate it. Instead we rely on source + * port collision resolution. + * ct_ctx_nat->sport = 10101; + */ + state->ct_result.nat_sip = ct_ctx_nat->src; + state->ct_result.nat_sport = ct_ctx_nat->sport; + } else if (!cali_rt_is_local(r) && !ip_equal(state->ip_src, HOST_IP)) { + CALI_DEBUG("remote wl %x fixing unexpected IP from lo %x\n", + debug_ip(state->post_nat_ip_dst), debug_ip(HOST_IP)); + ct_ctx_nat->src = HOST_IP; + /* This would be the place to set a new source port if we + * had a way how to allocate it. Instead we rely on source + * port collision resolution. 
+ * ct_ctx_nat->sport = 10101; + */ + state->ct_result.nat_sip = ct_ctx_nat->src; + state->ct_result.nat_sport = ct_ctx_nat->sport; + } } } diff --git a/felix/fv/bpf_test.go b/felix/fv/bpf_test.go index 9803c55fea2..97c93830651 100644 --- a/felix/fv/bpf_test.go +++ b/felix/fv/bpf_test.go @@ -1209,11 +1209,13 @@ func describeBPFTests(opts ...bpfTestOpt) bool { clusterIP := "10.101.0.10" extIP := "10.1.2.3" excludeSvcIP := "10.101.0.222" + loIP := "5.6.5.6" if testOpts.ipv6 { clusterIP = "dead:beef::abcd:0:0:10" extIP = "dead:beef::abcd:1:2:3" excludeSvcIP = "dead:beef::abcd:0:0:222" + loIP = "dead:beef::abcd:0:5656:5656" } if testOpts.protocol == "udp" && testOpts.udpUnConnected { @@ -3215,7 +3217,6 @@ func describeBPFTests(opts ...bpfTestOpt) bool { cc.Expect(Some, hostW[0], TargetIP(node1IP), ports, hostW0SrcIP) cc.Expect(Some, hostW[1], TargetIP(node0IP), ports, hostW1SrcIP) cc.Expect(Some, hostW[1], TargetIP(node1IP), ports, hostW1SrcIP) - cc.CheckConnectivity() }) @@ -3476,6 +3477,48 @@ func describeBPFTests(opts ...bpfTestOpt) bool { cc.CheckConnectivity() }) + + It("should have connectivity from all host-networked workloads to workload 0 "+ + "via clusterIP with non-routable address set on lo", func() { + // It only makes sense for turned off CTLB as with CTLB routing + // picks the right source IP. 
+ if testOpts.connTimeEnabled { + return + } + By("Configuring ip on lo") + tc.Felixes[0].Exec("ip", "addr", "add", loIP+"/"+ipMask(), "dev", "lo") + tc.Felixes[1].Exec("ip", "addr", "add", loIP+"/"+ipMask(), "dev", "lo") + + By("testing connectivity") + + node1IP := felixIP(1) + hostW0SrcIP := ExpectWithSrcIPs(loIP) + hostW1SrcIP := ExpectWithSrcIPs(node1IP) + + switch testOpts.tunnel { + case "ipip": + hostW1SrcIP = ExpectWithSrcIPs(tc.Felixes[1].ExpectedIPIPTunnelAddr) + case "wireguard": + if testOpts.ipv6 { + hostW1SrcIP = ExpectWithSrcIPs(tc.Felixes[1].ExpectedWireguardV6TunnelAddr) + } else { + hostW1SrcIP = ExpectWithSrcIPs(tc.Felixes[1].ExpectedWireguardTunnelAddr) + } + case "vxlan": + if testOpts.ipv6 { + hostW1SrcIP = ExpectWithSrcIPs(tc.Felixes[1].ExpectedVXLANV6TunnelAddr) + } else { + hostW1SrcIP = ExpectWithSrcIPs(tc.Felixes[1].ExpectedVXLANTunnelAddr) + } + } + clusterIP := testSvc.Spec.ClusterIP + ports := ExpectWithPorts(uint16(testSvc.Spec.Ports[0].Port)) + + cc.Expect(Some, hostW[0], TargetIP(clusterIP), ports, hostW0SrcIP) + cc.Expect(Some, hostW[1], TargetIP(clusterIP), ports, hostW1SrcIP) + + cc.CheckConnectivity() + }) }) }