-
Notifications
You must be signed in to change notification settings - Fork 749
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve CRI->checkpoint logic in the face of downgrades #1123
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,10 +52,28 @@ const ( | |
UnknownENIError = "datastore: unknown ENI" | ||
) | ||
|
||
// Enable temporary checkpoint upgrade logic. If checkpoint file is | ||
// not found, query CRI to learn current pods and currently-allocated | ||
// IPs. Will be disabled/removed after next release. | ||
const BackfillMissingCheckpointFromCRI = true | ||
// We need to know which IPs are already allocated across | ||
// ipamd/datastore restarts. In vpc-cni <=1.6, we "stored" the | ||
// allocated IPs by querying kubelet's CRI. Since this requires scary | ||
// access to CRI socket, and may race with CRI's internal logic, we | ||
// are transitioning away from this to storing allocations | ||
// ourselves in a file (similar to host-ipam CNI plugin). | ||
// | ||
// Because we don't want to require a node restart during CNI | ||
// upgrade/downgrade, we need an "expand/contract" style upgrade to | ||
// keep the two stores in sync: | ||
// | ||
// Migration phase0 (CNI 1.6): Read/write from CRI only. | ||
// Migration phase1 (CNI 1.7): Read from CRI. Write to CRI+file. | ||
// Migration phase2 (CNI 1.8?): Read from file. Write to CRI+file. | ||
// Migration phase3 (hypothetical): Read/write from file only. | ||
// | ||
// Note phase3 is not necessary since writes to CRI are implicit. | ||
// At/after phase2, we can remove any code protected by | ||
// CheckpointMigrationPhase<2. | ||
const CheckpointMigrationPhase = 1 | ||
|
||
// Placeholders used for unknown values when reading from CRI. | ||
const BackfillNetworkName = "_migrated-from-cri" | ||
const BackfillNetworkIface = "unknown" | ||
|
||
|
@@ -201,13 +219,14 @@ type PodIPInfo struct { | |
|
||
// DataStore contains node level ENI/IP | ||
type DataStore struct { | ||
total int | ||
assigned int | ||
eniPool ENIPool | ||
lock sync.Mutex | ||
log logger.Logger | ||
backingStore Checkpointer | ||
cri cri.APIs | ||
total int | ||
assigned int | ||
eniPool ENIPool | ||
lock sync.Mutex | ||
log logger.Logger | ||
CheckpointMigrationPhase int | ||
backingStore Checkpointer | ||
cri cri.APIs | ||
} | ||
|
||
// ENIInfos contains ENI IP information | ||
|
@@ -235,10 +254,11 @@ func prometheusRegister() { | |
func NewDataStore(log logger.Logger, backingStore Checkpointer) *DataStore { | ||
prometheusRegister() | ||
return &DataStore{ | ||
eniPool: make(ENIPool), | ||
log: log, | ||
backingStore: backingStore, | ||
cri: cri.New(), | ||
eniPool: make(ENIPool), | ||
log: log, | ||
backingStore: backingStore, | ||
cri: cri.New(), | ||
CheckpointMigrationPhase: CheckpointMigrationPhase, | ||
} | ||
} | ||
|
||
|
@@ -264,13 +284,13 @@ type CheckpointEntry struct { | |
// configured backing store. Should be called before using data | ||
// store. | ||
func (ds *DataStore) ReadBackingStore() error { | ||
ds.log.Infof("Reading ipam state from backing store") | ||
|
||
var data CheckpointData | ||
err := ds.backingStore.Restore(&data) | ||
ds.log.Debugf("backing store restore returned err %v", err) | ||
if BackfillMissingCheckpointFromCRI && os.IsNotExist(err) { | ||
ds.log.Infof("Backing store not found. Querying CRI.") | ||
|
||
switch ds.CheckpointMigrationPhase { | ||
case 1: | ||
// Phase1: Read from CRI | ||
ds.log.Infof("Reading ipam state from CRI") | ||
|
||
sandboxes, err := ds.cri.GetRunningPodSandboxes(ds.log) | ||
if err != nil { | ||
return err | ||
|
@@ -293,18 +313,28 @@ func (ds *DataStore) ReadBackingStore() error { | |
Allocations: entries, | ||
} | ||
|
||
} else if os.IsNotExist(err) { | ||
// Assume that no file == no containers are | ||
// currently in use, eg a fresh reboot just | ||
// cleared everything out. This is ok, and a | ||
// no-op. | ||
return nil | ||
} else if err != nil { | ||
return fmt.Errorf("datastore: error reading backing store: %v", err) | ||
} | ||
case 2: | ||
// Phase2: Read from checkpoint file | ||
ds.log.Infof("Reading ipam state from backing store") | ||
|
||
err := ds.backingStore.Restore(&data) | ||
ds.log.Debugf("backing store restore returned err %v", err) | ||
if os.IsNotExist(err) { | ||
// Assume that no file == no containers are | ||
// currently in use, eg a fresh reboot just | ||
// cleared everything out. This is ok, and a | ||
// no-op. | ||
return nil | ||
} else if err != nil { | ||
return fmt.Errorf("datastore: error reading backing store: %v", err) | ||
} | ||
|
||
if data.Version != CheckpointFormatVersion { | ||
return fmt.Errorf("datastore: unknown backing store format (%s != %s) - wrong CNI/ipamd version? (Rebooting this node will restart local pods and probably help)", data.Version, CheckpointFormatVersion) | ||
if data.Version != CheckpointFormatVersion { | ||
return fmt.Errorf("datastore: unknown backing store format (%s != %s) - wrong CNI/ipamd version? (Rebooting this node will restart local pods and probably help)", data.Version, CheckpointFormatVersion) | ||
} | ||
|
||
default: | ||
panic(fmt.Sprintf("Unexpected value of CheckpointMigrationPhase: %v", ds.CheckpointMigrationPhase)) | ||
} | ||
|
||
ds.lock.Lock() | ||
|
@@ -329,7 +359,7 @@ func (ds *DataStore) ReadBackingStore() error { | |
ds.log.Debugf("Recovered %s => %s/%s", allocation.IPAMKey, eni.ID, addr.Address) | ||
} | ||
|
||
ds.log.Debugf("Completed reading ipam state from backing store") | ||
ds.log.Debugf("Completed ipam state recovery") | ||
return nil | ||
} | ||
|
||
|
@@ -701,7 +731,16 @@ func (ds *DataStore) UnassignPodIPv4Address(ipamKey IPAMKey) (ip string, deviceN | |
|
||
eni, addr := ds.eniPool.FindAddressForSandbox(ipamKey) | ||
|
||
if BackfillMissingCheckpointFromCRI && addr == nil { | ||
if addr == nil { | ||
// This `if` block should be removed when the CRI | ||
// migration code is finally removed. Leaving a | ||
// compile dependency here to make that obvious :P | ||
var _ = CheckpointMigrationPhase | ||
|
||
// If the entry was discovered by querying CRI at | ||
// restart rather than by observing an ADD operation | ||
// directly, then we won't have captured the true | ||
// networkname/ifname. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we have unified key for in-memory data store and File ? If so we can avoid the above logic. Also AssignIPv4Address doesn't account for key that has been built during restore (i.e., const BackfillNetworkName = "_migrated-from-cri" and const BackfillNetworkIface = "unknown") There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We do: I think the placeholder values are not relevant for Oh, if you're referring to the "return existing assignment for existing IPAMKey" logic in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By unified key, i meant, can we populate entire key into file and restore it back so that we don't have to use const BackfillNetworkName = "_migrated-from-cri" and const BackfillNetworkIface = "unknown ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't. The only case where we ever use these placeholder values is the CRI->file upgrade case where we don't know the correct values (because CRI doesn't record/expose it). We could drop the network name and iface from the key in all cases (and assume there's only a single network and interface per sandbox), and thus be consistent across upgrade-from-CRI and restart-with-file. I added them because I was trying to actually conform to the CNI spec for the first time, which is quite clear on the intended key:
For completeness: if we just didn't support live CNI upgrades (without waiting for the next node reboot), then we would never have this whole class of challenges - because ADD/DEL would always be paired correctly on a consistent CNI version with a clear "reset" epoch at node reboot. Not a real suggestion, but just pointing out that live-upgrade is a problem we have created for ourselves and why the CNI interface doesn't handle this better. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks Gus for the detailed note. If I understand your explanation, CRI API called during nodeInit doesn't seem to provide all the information required to form a complete IPAM key. If this is the case, then CNI spec guideline that you have quoted above cannot be met using current CRI API. I would say operational correctness should win over spec conformance and we can revisit conformance once CRI API can provide everything required to form the key. If CNI_CONTAINERID is guaranteed to be unique enough, we could just have this as part of the key
||
ds.log.Debugf("UnassignPodIPv4Address: Failed to find IPAM entry under full key, trying CRI-migrated version") | ||
ipamKey.NetworkName = BackfillNetworkName | ||
ipamKey.IfName = BackfillNetworkIface | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -98,6 +98,7 @@ func TestNodeInit(t *testing.T) { | |
dataStore: datastore.NewDataStore(log, datastore.NewTestCheckpoint(fakeCheckpoint)), | ||
myNodeName: myNodeName, | ||
} | ||
mockContext.dataStore.CheckpointMigrationPhase = 2 | ||
|
||
eni1, eni2 := getDummyENIMetadata() | ||
|
||
|
@@ -518,7 +519,9 @@ func TestIPAMContext_nodeIPPoolTooLow(t *testing.T) { | |
} | ||
|
||
func testDatastore() *datastore.DataStore { | ||
return datastore.NewDataStore(log, datastore.NewTestCheckpoint(datastore.CheckpointData{Version: datastore.CheckpointFormatVersion})) | ||
ds := datastore.NewDataStore(log, datastore.NewTestCheckpoint(datastore.CheckpointData{Version: datastore.CheckpointFormatVersion})) | ||
ds.CheckpointMigrationPhase = 2 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since |
||
return ds | ||
} | ||
|
||
func datastoreWith3FreeIPs() *datastore.DataStore { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks a lot for the clear documentation. I'll update the readme as well in a follow up PR