From 0657bd34f81a5a7c18faacfb4364ed942cda6a81 Mon Sep 17 00:00:00 2001 From: Lauris BH Date: Sat, 30 May 2020 10:46:15 +0300 Subject: [PATCH 1/2] Change language statistics to save size instead of percentage (#11681) * Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> --- models/migrations/migrations.go | 2 + models/migrations/v140.go | 56 +++++++++++++++ models/repo_language_stats.go | 100 +++++++++++++++++++++----- modules/git/repo_language_stats.go | 25 ++----- modules/indexer/stats/indexer_test.go | 3 + 5 files changed, 149 insertions(+), 37 deletions(-) create mode 100644 models/migrations/v140.go diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go index 00d84da2e867..869661aee423 100644 --- a/models/migrations/migrations.go +++ b/models/migrations/migrations.go @@ -212,6 +212,8 @@ var migrations = []Migration{ NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), // v139 -> v140 NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), + // v140 -> v141 + NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize), } // GetCurrentDBVersion returns the current db version diff --git a/models/migrations/v140.go b/models/migrations/v140.go new file mode 100644 index 000000000000..871d14b84eec --- /dev/null +++ b/models/migrations/v140.go @@ -0,0 +1,56 @@ +// Copyright 2020 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package migrations + +import ( + "fmt" + + "code.gitea.io/gitea/modules/setting" + + "xorm.io/xorm" +) + +func fixLanguageStatsToSaveSize(x *xorm.Engine) error { + // LanguageStat see models/repo_language_stats.go + type LanguageStat struct { + Size int64 `xorm:"NOT NULL DEFAULT 0"` + } + + // RepoIndexerType specifies the repository indexer type + type RepoIndexerType int + + const ( + // RepoIndexerTypeCode code indexer + RepoIndexerTypeCode RepoIndexerType = iota // 0 + // RepoIndexerTypeStats repository stats indexer + RepoIndexerTypeStats // 1 + ) + + // RepoIndexerStatus see models/repo_indexer.go + type RepoIndexerStatus struct { + IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"` + } + + if err := x.Sync2(new(LanguageStat)); err != nil { + return fmt.Errorf("Sync2: %v", err) + } + + x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats}) + + // Delete language stat statuses + truncExpr := "TRUNCATE TABLE" + if setting.Database.UseSQLite3 { + truncExpr = "DELETE FROM" + } + + // Delete language stats + if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil { + return err + } + + sess := x.NewSession() + defer sess.Close() + return dropTableColumns(sess, "language_stat", "percentage") +} diff --git a/models/repo_language_stats.go b/models/repo_language_stats.go index 5f1aed1f3022..d08782eaf8a3 100644 --- a/models/repo_language_stats.go +++ b/models/repo_language_stats.go @@ -20,11 +20,28 @@ type LanguageStat struct { CommitID string IsPrimary bool Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` - Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` + Percentage float32 `xorm:"-"` + Size int64 `xorm:"NOT NULL DEFAULT 0"` Color string `xorm:"-"` CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` } +// specialLanguages defines list of languages that are excluded from the calculation +// unless they are the only language present in repository. 
Only languages which under +// normal circumstances are not considered to be code should be listed here. +var specialLanguages = map[string]struct{}{ + "XML": {}, + "JSON": {}, + "TOML": {}, + "YAML": {}, + "INI": {}, + "SQL": {}, + "SVG": {}, + "Text": {}, + "Markdown": {}, + "other": {}, +} + // LanguageStatList defines a list of language statistics type LanguageStatList []*LanguageStat @@ -34,12 +51,53 @@ func (stats LanguageStatList) loadAttributes() { } } +func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { + langPerc := make(map[string]float32) + var otherPerc float32 = 100 + var total int64 + // Check that repository has at least one non-special language + var skipSpecial bool + for _, stat := range stats { + if _, ok := specialLanguages[stat.Language]; !ok { + skipSpecial = true + break + } + } + for _, stat := range stats { + // Exclude specific languages from percentage calculation + if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { + continue + } + total += stat.Size + } + if total > 0 { + for _, stat := range stats { + // Exclude specific languages from percentage calculation + if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { + continue + } + perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10) + if perc <= 0.1 { + continue + } + otherPerc -= perc + langPerc[stat.Language] = perc + } + otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) + } else { + otherPerc = 100 + } + if otherPerc > 0 { + langPerc["other"] = otherPerc + } + return langPerc +} + func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { stats := make(LanguageStatList, 0, 6) - if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { + if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil { return nil, err } - stats.loadAttributes() return stats, nil } @@ -54,13 +112,18 @@ func (repo *Repository) GetTopLanguageStats(limit 
int) (LanguageStatList, error) if err != nil { return nil, err } + perc := stats.getLanguagePercentages() topstats := make(LanguageStatList, 0, limit) var other float32 for i := range stats { + if _, ok := perc[stats[i].Language]; !ok { + continue + } if stats[i].Language == "other" || len(topstats) >= limit { - other += stats[i].Percentage + other += perc[stats[i].Language] continue } + stats[i].Percentage = perc[stats[i].Language] topstats = append(topstats, stats[i]) } if other > 0 { @@ -71,11 +134,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) Percentage: float32(math.Round(float64(other)*10) / 10), }) } + topstats.loadAttributes() return topstats, nil } // UpdateLanguageStats updates the language statistics for repository -func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { +func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error { sess := x.NewSession() if err := sess.Begin(); err != nil { return err @@ -87,15 +151,15 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl return err } var topLang string - var p float32 - for lang, perc := range stats { - if perc > p { - p = perc + var s int64 + for lang, size := range stats { + if size > s { + s = size topLang = strings.ToLower(lang) } } - for lang, perc := range stats { + for lang, size := range stats { upd := false llang := strings.ToLower(lang) for _, s := range oldstats { @@ -103,8 +167,8 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl if strings.ToLower(s.Language) == llang { s.CommitID = commitID s.IsPrimary = llang == topLang - s.Percentage = perc - if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { + s.Size = size + if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil { return err } upd = true @@ -114,11 +178,11 @@ func (repo 
*Repository) UpdateLanguageStats(commitID string, stats map[string]fl // Insert new language if !upd { if _, err := sess.Insert(&LanguageStat{ - RepoID: repo.ID, - CommitID: commitID, - IsPrimary: llang == topLang, - Language: lang, - Percentage: perc, + RepoID: repo.ID, + CommitID: commitID, + IsPrimary: llang == topLang, + Language: lang, + Size: size, }); err != nil { return err } @@ -153,7 +217,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error { return err } RepoLang := make(LanguageStatList, 0, 6) - if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil { + if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil { return err } if len(RepoLang) > 0 { diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go index 8ff8fa20c1f6..d623d6f57d3e 100644 --- a/modules/git/repo_language_stats.go +++ b/modules/git/repo_language_stats.go @@ -8,7 +8,6 @@ import ( "bytes" "io" "io/ioutil" - "math" "code.gitea.io/gitea/modules/analyze" @@ -21,7 +20,7 @@ import ( const fileSizeLimit int64 = 16 * 1024 * 1024 // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { +func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { r, err := git.PlainOpen(repo.Path) if err != nil { return nil, err @@ -43,7 +42,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e } sizes := make(map[string]int64) - var total int64 err = tree.Files().ForEach(func(f *object.File) error { if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { @@ -60,11 +58,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e language := analyze.GetCodeLanguage(f.Name, content) if language == enry.OtherLanguage || 
language == "" { - return nil + language = "other" } sizes[language] += f.Size - total += f.Size return nil }) @@ -72,21 +69,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e return nil, err } - stats := make(map[string]float32) - var otherPerc float32 = 100 - for language, size := range sizes { - perc := float32(math.Round(float64(size)/float64(total)*1000) / 10) - if perc <= 0.1 { - continue - } - otherPerc -= perc - stats[language] = perc + if len(sizes) == 0 { + sizes["other"] = 0 } - otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) - if otherPerc > 0 { - stats["other"] = otherPerc - } - return stats, nil + + return sizes, nil } func readFile(f *object.File, limit int64) ([]byte, error) { diff --git a/modules/indexer/stats/indexer_test.go b/modules/indexer/stats/indexer_test.go index 29d0f6dbe4a6..b60c6d9bb4d3 100644 --- a/modules/indexer/stats/indexer_test.go +++ b/modules/indexer/stats/indexer_test.go @@ -34,6 +34,9 @@ func TestRepoStatsIndex(t *testing.T) { repo, err := models.GetRepositoryByID(1) assert.NoError(t, err) + status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats) + assert.NoError(t, err) + assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha) langs, err := repo.GetTopLanguageStats(5) assert.NoError(t, err) assert.Len(t, langs, 1) From 855f730e3b9f689492691e131616ad992b66604b Mon Sep 17 00:00:00 2001 From: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> Date: Sun, 31 May 2020 00:58:55 +0200 Subject: [PATCH 2/2] Fix language stat calculation (#11692) * Fix language stat calculation * Group languages and ignore 0 size files * remove unneeded code --- models/repo_language_stats.go | 35 +-------------------------- modules/git/repo_language_stats.go | 31 +++++++++++++++++++++--- modules/indexer/stats/indexer_test.go | 4 +-- 3 files changed, 29 insertions(+), 41 deletions(-) diff --git a/models/repo_language_stats.go b/models/repo_language_stats.go index 
d08782eaf8a3..a15063e25a6e 100644 --- a/models/repo_language_stats.go +++ b/models/repo_language_stats.go @@ -26,22 +26,6 @@ type LanguageStat struct { CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` } -// specialLanguages defines list of languages that are excluded from the calculation -// unless they are the only language present in repository. Only languages which under -// normal circumstances are not considered to be code should be listed here. -var specialLanguages = map[string]struct{}{ - "XML": {}, - "JSON": {}, - "TOML": {}, - "YAML": {}, - "INI": {}, - "SQL": {}, - "SVG": {}, - "Text": {}, - "Markdown": {}, - "other": {}, -} - // LanguageStatList defines a list of language statistics type LanguageStatList []*LanguageStat @@ -55,27 +39,12 @@ func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { langPerc := make(map[string]float32) var otherPerc float32 = 100 var total int64 - // Check that repository has at least one non-special language - var skipSpecial bool - for _, stat := range stats { - if _, ok := specialLanguages[stat.Language]; !ok { - skipSpecial = true - break - } - } + for _, stat := range stats { - // Exclude specific languages from percentage calculation - if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { - continue - } total += stat.Size } if total > 0 { for _, stat := range stats { - // Exclude specific languages from percentage calculation - if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { - continue - } perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10) if perc <= 0.1 { continue @@ -84,8 +53,6 @@ func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { langPerc[stat.Language] = perc } otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) - } else { - otherPerc = 100 } if otherPerc > 0 { langPerc["other"] = otherPerc diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go index d623d6f57d3e..06d7d6aba02e 
100644 --- a/modules/git/repo_language_stats.go +++ b/modules/git/repo_language_stats.go @@ -19,6 +19,20 @@ import ( const fileSizeLimit int64 = 16 * 1024 * 1024 +// specialLanguages defines list of languages that are excluded from the calculation +// unless they are the only language present in repository. Only languages which under +// normal circumstances are not considered to be code should be listed here. +var specialLanguages = []string{ + "XML", + "JSON", + "TOML", + "YAML", + "INI", + "SVG", + "Text", + "Markdown", +} + // GetLanguageStats calculates language stats for git repository at specified commit func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { r, err := git.PlainOpen(repo.Path) @@ -43,7 +57,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err sizes := make(map[string]int64) err = tree.Files().ForEach(func(f *object.File) error { - if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || + if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { return nil } @@ -58,7 +72,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err language := analyze.GetCodeLanguage(f.Name, content) if language == enry.OtherLanguage || language == "" { - language = "other" + return nil + } + + // group languages, such as Pug -> HTML; SCSS -> CSS + group := enry.GetLanguageGroup(language) + if group != "" { + language = group } sizes[language] += f.Size @@ -69,8 +89,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } - if len(sizes) == 0 { - sizes["other"] = 0 + // filter special languages unless they are the only language + if len(sizes) > 1 { + for _, language := range specialLanguages { + delete(sizes, language) + } } return sizes, nil diff --git a/modules/indexer/stats/indexer_test.go b/modules/indexer/stats/indexer_test.go index 
b60c6d9bb4d3..4bcbaa942377 100644 --- a/modules/indexer/stats/indexer_test.go +++ b/modules/indexer/stats/indexer_test.go @@ -39,7 +39,5 @@ func TestRepoStatsIndex(t *testing.T) { assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha) langs, err := repo.GetTopLanguageStats(5) assert.NoError(t, err) - assert.Len(t, langs, 1) - assert.Equal(t, "other", langs[0].Language) - assert.Equal(t, float32(100), langs[0].Percentage) + assert.Empty(t, langs) }