diff --git a/ipamd/introspect.go b/ipamd/introspect.go index 1bbf9b9d61..d83276cbd8 100644 --- a/ipamd/introspect.go +++ b/ipamd/introspect.go @@ -23,6 +23,7 @@ import ( log "github.com/cihub/seelog" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/aws/amazon-vpc-cni-k8s/pkg/networkutils" "github.com/aws/amazon-vpc-cni-k8s/pkg/utils" ) @@ -68,8 +69,9 @@ func (c *IPAMContext) SetupHTTP() { func (c *IPAMContext) setupServer() *http.Server { serverFunctions := map[string]func(w http.ResponseWriter, r *http.Request){ - "/v1/enis": eniV1RequestHandler(c), - "/v1/pods": podV1RequestHandler(c), + "/v1/enis": eniV1RequestHandler(c), + "/v1/pods": podV1RequestHandler(c), + "/v1/env-settings": envV1RequestHandler(c), } paths := make([]string, 0, len(serverFunctions)) for path := range serverFunctions { @@ -132,6 +134,18 @@ func podV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Requ } } +func envV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) { + return func(w http.ResponseWriter, r *http.Request) { + responseJSON, err := json.Marshal(networkutils.GetConfigForDebug()) + if err != nil { + log.Error("Failed to marshal env var data: %v", err) + http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError) + return + } + w.Write(responseJSON) + } +} + func metricsHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) { return func(w http.ResponseWriter, r *http.Request) { promhttp.Handler() diff --git a/pkg/networkutils/network.go b/pkg/networkutils/network.go index 4b634b0254..6b45076806 100644 --- a/pkg/networkutils/network.go +++ b/pkg/networkutils/network.go @@ -14,6 +14,9 @@ package networkutils import ( + "fmt" + "io" + "math" "net" "os" "strconv" @@ -47,8 +50,26 @@ const ( // This environment is used to specify whether an external NAT gateway will be used to provide SNAT of // secondary ENI IP addresses. If set to "true", the SNAT iptables rule and off-VPC ip rule will not - // be installed and will be removed if they are already installed. + // be installed and will be removed if they are already installed. Defaults to false. envExternalSNAT = "AWS_VPC_K8S_CNI_EXTERNALSNAT" + + // envNodePortSupport is the name of environment variable that configures whether we implement support for + // NodePorts on the primary ENI. This requires that we add additional iptables rules and loosen the kernel's + // RPF check as described below. Defaults to true. + envNodePortSupport = "AWS_VPC_CNI_NODE_PORT_SUPPORT" + + // envConnmark is the name of the environment variable that overrides the default connection mark, used to + // mark traffic coming from the primary ENI so that return traffic can be forced out of the same interface. + // Without using a mark, NodePort DNAT and our source-based routing do not work together if the target pod + // behind the node port is not on the main ENI. In that case, the un-DNAT is done after the source-based + // routing, resulting in the packet being sent out of the pod's ENI, when the NodePort traffic should be + // sent over the main ENI. + envConnmark = "AWS_VPC_K8S_CNI_CONNMARK" + + // defaultConnmark is the default value for the connmark described above. Note: the mark space is a little crowded, + // - kube-proxy uses 0x0000c000 + // - Calico uses 0xffff0000. + defaultConnmark = 0x80 ) // NetworkAPIs defines the host level and the eni level network related operations @@ -60,14 +81,45 @@ type NetworkAPIs interface { } type linuxNetwork struct { - netLink netlinkwrapper.NetLink - ns nswrapper.NS + useExternalSNAT bool + nodePortSupportEnabled bool + connmark uint32 + + netLink netlinkwrapper.NetLink + ns nswrapper.NS + newIptables func() (iptablesIface, error) + mainENIMark uint32 + openFile func(name string, flag int, perm os.FileMode) (stringWriteCloser, error) +} + +type iptablesIface interface { + Exists(table, chain string, rulespec ...string) (bool, error) + Append(table, chain string, rulespec ...string) error + Delete(table, chain string, rulespec ...string) error } // New creates a linuxNetwork object func New() NetworkAPIs { - return &linuxNetwork{netLink: netlinkwrapper.NewNetLink(), - ns: nswrapper.NewNS()} + return &linuxNetwork{ + useExternalSNAT: useExternalSNAT(), + nodePortSupportEnabled: nodePortSupportEnabled(), + mainENIMark: getConnmark(), + + netLink: netlinkwrapper.NewNetLink(), + ns: nswrapper.NewNS(), + newIptables: func() (iptablesIface, error) { + ipt, err := iptables.New() + return ipt, err + }, + openFile: func(name string, flag int, perm os.FileMode) (stringWriteCloser, error) { + return os.OpenFile(name, flag, perm) + }, + } +} + +type stringWriteCloser interface { + io.Closer + WriteString(s string) (int, error) } func isDuplicateRuleAdd(err error) bool { @@ -76,17 +128,16 @@ func isDuplicateRuleAdd(err error) bool { // SetupHostNetwork performs node level network configuration // TODO : implement ip rule not to 10.0.0.0/16(vpc'subnet) table main priority 1024 -func (os *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP) error { - - externalSNAT := useExternalSNAT() - hostRule := os.netLink.NewRule() +func (n *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP) error { + log.Info("Setting up host network") + hostRule := n.netLink.NewRule() hostRule.Dst = vpcCIDR hostRule.Table = mainRoutingTable hostRule.Priority = hostRulePriority hostRule.Invert = true // If this is a restart, cleanup previous rule first - err := os.netLink.RuleDel(hostRule) + err := n.netLink.RuleDel(hostRule) if err != nil && !containsNoSuchRule(err) { log.Errorf("Failed to cleanup old host IP rule: %v", err) return errors.Wrapf(err, "host network setup: failed to delete old host rule") @@ -94,47 +145,143 @@ func (os *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP // Only include the rule if SNAT is not being handled by an external NAT gateway and needs to be // handled on-node. - if !externalSNAT { - err = os.netLink.RuleAdd(hostRule) + if !n.useExternalSNAT { + err = n.netLink.RuleAdd(hostRule) if err != nil { log.Errorf("Failed to add host IP rule: %v", err) return errors.Wrapf(err, "host network setup: failed to add host rule") } } - ipt, err := iptables.New() + if n.nodePortSupportEnabled { + // If node port support is enabled, configure the kernel's reverse path filter check on eth0 for "loose" + // filtering. This is required because + // - NodePorts are exposed on eth0 + // - The kernel's RPF check happens after incoming packets to NodePorts are DNATted to the pod IP. + // - For pods assigned to secondary ENIs, the routing table includes source-based routing. When the kernel does + // the RPF check, it looks up the route using the pod IP as the source. + // - Thus, it finds the source-based route that leaves via the secondary ENI. + // - In "strict" mode, the RPF check fails because the return path uses a different interface to the incoming + // packet. In "loose" mode, the check passes because some route was found. + const eth0RPFilter = "/proc/sys/net/ipv4/conf/eth0/rp_filter" + const rpFilterLoose = "2" + err := n.setProcSys(eth0RPFilter, rpFilterLoose) + if err != nil { + return errors.Wrapf(err, "failed to configure eth0 RPF check") + } + } - if err != nil { - return errors.Wrap(err, "host network setup: failed to create iptables") + // If node port support is enabled, add a rule that will force force marked traffic out of the main ENI. We then + // add iptables rules below that will mark traffic that needs this special treatment. In particular NodePort + // traffic always comes in via the main ENI but response traffic would go out of the pod's assigned ENI if we + // didn't handle it specially. This is because the routing decision is done before the NodePort's DNAT is + // reversed so, to the routing table, it looks like the traffic is pod traffic instead of NodePort traffic. + mainENIRule := n.netLink.NewRule() + mainENIRule.Mark = int(n.mainENIMark) + mainENIRule.Mask = int(n.mainENIMark) + mainENIRule.Table = mainRoutingTable + mainENIRule.Priority = hostRulePriority + // If this is a restart, cleanup previous rule first + err = n.netLink.RuleDel(mainENIRule) + if err != nil && !containsNoSuchRule(err) { + log.Errorf("Failed to cleanup old main ENI rule: %v", err) + return errors.Wrapf(err, "host network setup: failed to delete old main ENI rule") } - natCmd := []string{"!", "-d", vpcCIDR.String(), "-m", "comment", "--comment", "AWS, SNAT", - "-m", "addrtype", "!", "--dst-type", "LOCAL", "-j", "SNAT", "--to-source", primaryAddr.String()} - exists, err := ipt.Exists("nat", "POSTROUTING", natCmd...) + if n.nodePortSupportEnabled { + err = n.netLink.RuleAdd(mainENIRule) + if err != nil { + log.Errorf("Failed to add host main ENI rule: %v", err) + return errors.Wrapf(err, "host network setup: failed to add main ENI rule") + } + } + + ipt, err := n.newIptables() if err != nil { - return errors.Wrapf(err, "host network setup: failed to add POSTROUTING rule for primary address %s", primaryAddr) + return errors.Wrap(err, "host network setup: failed to create iptables") } - if !exists && !externalSNAT { - // We are handling SNAT on-node, so include the iptables SNAT POSTROUTING rule. - err = ipt.Append("nat", "POSTROUTING", natCmd...) - + for _, rule := range []iptablesRule{ + { + name: "connmark for primary ENI", + shouldExist: n.nodePortSupportEnabled, + table: "mangle", + chain: "PREROUTING", + rule: []string{ + "-m", "comment", "--comment", "AWS, primary ENI", + "-i", "eth0", + "-m", "addrtype", "--dst-type", "LOCAL", "--limit-iface-in", + "-j", "CONNMARK", "--set-mark", fmt.Sprintf("%#x/%#x", n.mainENIMark, n.mainENIMark), + }, + }, + { + name: "connmark restore for primary ENI", + shouldExist: n.nodePortSupportEnabled, + table: "mangle", + chain: "PREROUTING", + rule: []string{ + "-m", "comment", "--comment", "AWS, primary ENI", + "-i", "eni+", "-j", "CONNMARK", "--restore-mark", "--mask", fmt.Sprintf("%#x", n.mainENIMark), + }, + }, + { + name: fmt.Sprintf("rule for primary address %s", primaryAddr), + shouldExist: !n.useExternalSNAT, + table: "nat", + chain: "POSTROUTING", + rule: []string{ + "!", "-d", vpcCIDR.String(), + "-m", "comment", "--comment", "AWS, SNAT", + "-m", "addrtype", "!", "--dst-type", "LOCAL", + "-j", "SNAT", "--to-source", primaryAddr.String()}, + }, + } { + exists, err := ipt.Exists(rule.table, rule.chain, rule.rule...) if err != nil { - return errors.Wrapf(err, "host network setup: failed to append POSTROUTING rule for primary address %s", primaryAddr) + return errors.Wrapf(err, "host network setup: failed to check existence of %v", rule) } - } else if exists && externalSNAT { - // We are not handling SNAT on-node, so delete the existing iptables SNAT POSTROUTING rule. - err = ipt.Delete("nat", "POSTROUTING", natCmd...) - if err != nil { - return errors.Wrapf(err, "host network setup: failed to delete POSTROUTING rule for primary address %s", primaryAddr) + if !exists && rule.shouldExist { + err = ipt.Append(rule.table, rule.chain, rule.rule...) + if err != nil { + return errors.Wrapf(err, "host network setup: failed to add %v", rule) + } + } else if exists && !rule.shouldExist { + err = ipt.Delete(rule.table, rule.chain, rule.rule...) + if err != nil { + return errors.Wrapf(err, "host network setup: failed to delete %v", rule) + } } } return nil } +func (n *linuxNetwork) setProcSys(key, value string) error { + f, err := n.openFile(key, os.O_WRONLY, 0644) + if err != nil { + return err + } + defer f.Close() + _, err = f.WriteString(value) + if err != nil { + return err + } + return nil +} + +type iptablesRule struct { + name string + shouldExist bool + table, chain string + rule []string +} + +func (r iptablesRule) String() string { + return fmt.Sprintf("%s/%s rule %s", r.table, r.chain, r.name) +} + func containsNoSuchRule(err error) bool { if errno, ok := err.(syscall.Errno); ok { return errno == syscall.ENOENT @@ -142,19 +289,52 @@ func containsNoSuchRule(err error) bool { return false } +// GetConfigForDebug returns the active values of the configuration env vars (for debugging purposes). +func GetConfigForDebug() map[string]interface{} { + return map[string]interface{}{ + envExternalSNAT: useExternalSNAT(), + envNodePortSupport: nodePortSupportEnabled(), + envConnmark: getConnmark(), + } +} + // useExternalSNAT returns whether SNAT of secondary ENI IPs should be handled with an external // NAT gateway rather than on node. Failure to parse the setting will result in a log and the // setting will be disabled. func useExternalSNAT() bool { - if externalSNATStr := os.Getenv(envExternalSNAT); externalSNATStr != "" { - externalSNAT, err := strconv.ParseBool(externalSNATStr) + return getBoolEnvVar(envExternalSNAT, false) +} + +func nodePortSupportEnabled() bool { + return getBoolEnvVar(envNodePortSupport, true) +} + +func getBoolEnvVar(name string, defaultValue bool) bool { + if strValue := os.Getenv(name); strValue != "" { + parsedValue, err := strconv.ParseBool(strValue) if err != nil { - log.Error("Failed to parse "+envExternalSNAT, err.Error()) - return false + log.Error("Failed to parse "+name+"; using default: "+fmt.Sprint(defaultValue), err.Error()) + return defaultValue } - return externalSNAT + return parsedValue } - return false + return defaultValue +} + +func getConnmark() uint32 { + if connmark := os.Getenv(envConnmark); connmark != "" { + mark, err := strconv.ParseInt(connmark, 0, 64) + if err != nil { + log.Error("Failed to parse "+envConnmark+"; will use ", defaultConnmark, err.Error()) + return defaultConnmark + } + if mark > math.MaxUint32 || mark <= 0 { + log.Error(""+envConnmark+" out of range; will use ", defaultConnmark) + return defaultConnmark + } + return uint32(mark) + } + return defaultConnmark } // LinkByMac returns linux netlink based on interface MAC @@ -177,8 +357,8 @@ func LinkByMac(mac string, netLink netlinkwrapper.NetLink) (netlink.Link, error) } // SetupENINetwork adds default route to route table (eni-) -func (os *linuxNetwork) SetupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string) error { - return setupENINetwork(eniIP, eniMAC, eniTable, eniSubnetCIDR, os.netLink) +func (n *linuxNetwork) SetupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string) error { + return setupENINetwork(eniIP, eniMAC, eniTable, eniSubnetCIDR, n.netLink) } func setupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string, netLink netlinkwrapper.NetLink) error { diff --git a/pkg/networkutils/network_test.go b/pkg/networkutils/network_test.go index 7c22a8c187..ed778c39fd 100644 --- a/pkg/networkutils/network_test.go +++ b/pkg/networkutils/network_test.go @@ -14,7 +14,10 @@ package networkutils import ( + "errors" "net" + "os" + "reflect" "testing" "github.com/golang/mock/gomock" @@ -43,19 +46,26 @@ const ( testeniSubnet = "10.10.0.0/16" ) +var ( + _, testENINetIPNet, _ = net.ParseCIDR(testeniSubnet) + testENINetIP = net.ParseIP(testeniIP) +) + func setup(t *testing.T) (*gomock.Controller, *mock_netlinkwrapper.MockNetLink, *mocks_ip.MockIP, - *mock_nswrapper.MockNS) { + *mock_nswrapper.MockNS, + *mockIptables) { ctrl := gomock.NewController(t) return ctrl, mock_netlinkwrapper.NewMockNetLink(ctrl), mocks_ip.NewMockIP(ctrl), - mock_nswrapper.NewMockNS(ctrl) + mock_nswrapper.NewMockNS(ctrl), + newMockIptables() } func TestSetupENINetwork(t *testing.T) { - ctrl, mockNetLink, _, _ := setup(t) + ctrl, mockNetLink, _, _, _ := setup(t) defer ctrl.Finish() hwAddr, err := net.ParseMAC(testMAC1) @@ -97,9 +107,164 @@ func TestSetupENINetwork(t *testing.T) { } func TestSetupENINetworkPrimary(t *testing.T) { - ctrl, mockNetLink, _, _ := setup(t) + ctrl, mockNetLink, _, _, _ := setup(t) defer ctrl.Finish() err := setupENINetwork(testeniIP, testMAC2, 0, testeniSubnet, mockNetLink) assert.NoError(t, err) } + +func TestSetupHostNetworkNodePortDisabled(t *testing.T) { + ctrl, mockNetLink, _, mockNS, mockIptables := setup(t) + defer ctrl.Finish() + + ln := &linuxNetwork{ + mainENIMark: 0x80, + + netLink: mockNetLink, + ns: mockNS, + newIptables: func() (iptablesIface, error) { + return mockIptables, nil + }, + } + + var hostRule netlink.Rule + mockNetLink.EXPECT().NewRule().Return(&hostRule) + mockNetLink.EXPECT().RuleDel(&hostRule) + mockNetLink.EXPECT().RuleAdd(&hostRule) + var mainENIRule netlink.Rule + mockNetLink.EXPECT().NewRule().Return(&mainENIRule) + mockNetLink.EXPECT().RuleDel(&mainENIRule) + + err := ln.SetupHostNetwork(testENINetIPNet, &testENINetIP) + assert.NoError(t, err) + + assert.Equal(t, map[string]map[string][][]string{ + "nat": { + "POSTROUTING": [][]string{ + { + "!", "-d", testeniSubnet, + "-m", "comment", "--comment", "AWS, SNAT", + "-m", "addrtype", "!", "--dst-type", "LOCAL", + "-j", "SNAT", "--to-source", testeniIP, + }, + }, + }, + }, mockIptables.dataplaneState) +} + +func TestSetupHostNetworkNodePortEnabled(t *testing.T) { + ctrl, mockNetLink, _, mockNS, mockIptables := setup(t) + defer ctrl.Finish() + + var mockRPFilter mockFile + ln := &linuxNetwork{ + useExternalSNAT: true, + nodePortSupportEnabled: true, + mainENIMark: defaultConnmark, + + netLink: mockNetLink, + ns: mockNS, + newIptables: func() (iptablesIface, error) { + return mockIptables, nil + }, + openFile: func(name string, flag int, perm os.FileMode) (stringWriteCloser, error) { + return &mockRPFilter, nil + }, + } + + var hostRule netlink.Rule + mockNetLink.EXPECT().NewRule().Return(&hostRule) + mockNetLink.EXPECT().RuleDel(&hostRule) + var mainENIRule netlink.Rule + mockNetLink.EXPECT().NewRule().Return(&mainENIRule) + mockNetLink.EXPECT().RuleDel(&mainENIRule) + mockNetLink.EXPECT().RuleAdd(&mainENIRule) + + err := ln.SetupHostNetwork(testENINetIPNet, &testENINetIP) + assert.NoError(t, err) + + assert.Equal(t, map[string]map[string][][]string{ + "mangle": { + "PREROUTING": [][]string{ + { + "-m", "comment", "--comment", "AWS, primary ENI", + "-i", "eth0", + "-m", "addrtype", "--dst-type", "LOCAL", "--limit-iface-in", + "-j", "CONNMARK", "--set-mark", "0x80/0x80", + }, + { + "-m", "comment", "--comment", "AWS, primary ENI", + "-i", "eni+", "-j", "CONNMARK", "--restore-mark", "--mask", "0x80", + }, + }, + }, + }, mockIptables.dataplaneState) + assert.Equal(t, mockFile{closed: true, data: "2"}, mockRPFilter) +} + +type mockIptables struct { + // dataplaneState is a map from table name to chain name to slice of rulespecs + dataplaneState map[string]map[string][][]string +} + +func newMockIptables() *mockIptables { + return &mockIptables{dataplaneState: map[string]map[string][][]string{}} +} + +func (ipt *mockIptables) Exists(table, chainName string, rulespec ...string) (bool, error) { + chain := ipt.dataplaneState[table][chainName] + for _, r := range chain { + if reflect.DeepEqual(rulespec, r) { + return true, nil + } + } + return false, nil +} + +func (ipt *mockIptables) Append(table, chain string, rulespec ...string) error { + if ipt.dataplaneState[table] == nil { + ipt.dataplaneState[table] = map[string][][]string{} + } + ipt.dataplaneState[table][chain] = append(ipt.dataplaneState[table][chain], rulespec) + return nil +} + +func (ipt *mockIptables) Delete(table, chainName string, rulespec ...string) error { + chain := ipt.dataplaneState[table][chainName] + updatedChain := chain[:0] + found := false + for _, r := range chain { + if !found && reflect.DeepEqual(rulespec, r) { + found = true + continue + } + updatedChain = append(updatedChain, r) + } + if ! found { + return errors.New("not found") + } + ipt.dataplaneState[table][chainName] = updatedChain + return nil +} + +type mockFile struct { + closed bool + data string +} + +func (f *mockFile) WriteString(s string) (int, error) { + if f.closed { + panic("write call on closed file") + } + f.data += s + return len(s), nil +} + +func (f *mockFile) Close() error { + if f.closed { + panic("close call on closed file") + } + f.closed = true + return nil +} diff --git a/scripts/aws-cni-support.sh b/scripts/aws-cni-support.sh index 7673ae324d..74fd62fc27 100755 --- a/scripts/aws-cni-support.sh +++ b/scripts/aws-cni-support.sh @@ -21,8 +21,9 @@ set -e LOG_DIR="/var/log/aws-routed-eni" # collecting L-IPAMD introspection data -curl http://localhost:61678/v1/enis > ${LOG_DIR}/eni.output -curl http://localhost:61678/v1/pods > ${LOG_DIR}/pod.output +curl http://localhost:61678/v1/enis > ${LOG_DIR}/eni.output +curl http://localhost:61678/v1/pods > ${LOG_DIR}/pod.output +curl http://localhost:61678/v1/env-settings > ${LOG_DIR}/env.output # metrics TODO not able to use LOG_DIR curl http://localhost:61678/metrics 2>&1 > /var/log/aws-routed-eni/metrics.output @@ -58,4 +59,10 @@ echo "=============================================" >> ${LOG_DIR}/${ROUTE_OUTPU echo "ip route show table all" >> $LOG_DIR/$ROUTE_OUTPUT ip route show table all >> $LOG_DIR/$ROUTE_OUTPUT +# dump relevant sysctls +echo "================== sysctls ==================" > ${LOG_DIR}/sysctls.out +for f in /proc/sys/net/ipv4/conf/{all,default,eth0}/rp_filter; do + echo "$f = $(cat $f)" >> ${LOG_DIR}/sysctls.out +done + tar -cvzf $LOG_DIR/aws-cni-support.tar.gz ${LOG_DIR}/