From f88984073935b0eff4d7e7efdc3338c793cab406 Mon Sep 17 00:00:00 2001 From: Yadong Ding Date: Thu, 24 Aug 2023 12:01:48 +0800 Subject: [PATCH] fix: gc works unexpeted in high concurrent When acceld converts multiple images with large size differences, for example : alpine and WordPress. Obviously alpine will finish the task first, and WordPress is still in the pulling phase. If alpine triggers gc(decide by configuration), GC will clear the blobs of WordPress, because the lease of WordPress blob had cached in lease and only used once(in commit). To solve this problem, now gc will wait all converting tasks completed. fix #174 Signed-off-by: Yadong Ding --- pkg/adapter/adapter.go | 3 +++ pkg/content/content.go | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/pkg/adapter/adapter.go b/pkg/adapter/adapter.go index 37318091..1a831495 100644 --- a/pkg/adapter/adapter.go +++ b/pkg/adapter/adapter.go @@ -113,9 +113,12 @@ func (adp *LocalAdapter) Convert(ctx context.Context, source string) error { if err = adp.content.NewRemoteCache(cacheRef); err != nil { return err } + adp.content.GcMutex.RLock() if _, err = adp.cvt.Convert(ctx, source, target, cacheRef); err != nil { + adp.content.GcMutex.RUnlock() return err } + adp.content.GcMutex.RUnlock() if err := adp.content.GC(ctx); err != nil { return err } diff --git a/pkg/content/content.go b/pkg/content/content.go index 6612aae6..0349e3a3 100644 --- a/pkg/content/content.go +++ b/pkg/content/content.go @@ -19,6 +19,7 @@ import ( "fmt" "path/filepath" "strconv" + "sync" "time" ctrcontent "github.com/containerd/containerd/content" @@ -47,6 +48,8 @@ type Content struct { lm leases.Manager // gcSingleflight help to resolve concurrent gc gcSingleflight *singleflight.Group + // GcMutex works between gc and convert + GcMutex *sync.RWMutex // lc cache the used count and reference order of lease lc *leaseCache // store is the local content store wrapped inner db @@ -95,6 +98,7 @@ func NewContent(contentDir string, databaseDir string, threshold string, useRemo db: db, lm: metadata.NewLeaseManager(db), gcSingleflight: &singleflight.Group{}, + GcMutex: &sync.RWMutex{}, lc: lc, store: db.ContentStore(), threshold: int64(t), @@ -137,6 +141,13 @@ func (content *Content) GC(ctx context.Context) error { // if the local content size over eighty percent of threshold, gc start if size > (content.threshold*int64(80))/100 { if _, err, _ := content.gcSingleflight.Do(accelerationServiceNamespace, func() (interface{}, error) { + content.GcMutex.Lock() + defer content.GcMutex.Unlock() + // recalculate the local cache size + size, err := content.Size() + if err != nil { + return nil, err + } return nil, content.garbageCollect(ctx, size-(content.threshold*int64(80))/100) }); err != nil { return err