diff --git a/build/generate-licenses.go b/build/generate-licenses.go
index 9a111bc811..7909f1ee39 100644
--- a/build/generate-licenses.go
+++ b/build/generate-licenses.go
@@ -4,7 +4,9 @@ package main
 
 import (
 	"archive/tar"
+	"bytes"
 	"compress/gzip"
+	"crypto/md5"
 	"flag"
 	"fmt"
 	"io"
@@ -15,6 +17,7 @@ import (
 	"path/filepath"
 	"strings"
 
+	"code.gitea.io/gitea/modules/json"
 	"code.gitea.io/gitea/modules/util"
 )
 
@@ -77,7 +80,9 @@ func main() {
 	}
 
 	tr := tar.NewReader(gz)
-
+	var pf *os.File
+	var pfn string
+	sameFiles := make(map[string][]string)
 	for {
 		hdr, err := tr.Next()
 
@@ -97,14 +102,17 @@ func main() {
 			continue
 		}
 
-		if strings.HasPrefix(filepath.Base(hdr.Name), "README") {
+		fbn := filepath.Base(hdr.Name)
+		ln := strings.TrimSuffix(fbn, ".txt")
+
+		if strings.HasPrefix(fbn, "README") {
 			continue
 		}
 
-		if strings.HasPrefix(filepath.Base(hdr.Name), "deprecated_") {
+		if strings.HasPrefix(fbn, "deprecated_") {
 			continue
 		}
-		out, err := os.Create(path.Join(destination, strings.TrimSuffix(filepath.Base(hdr.Name), ".txt")))
+		out, err := os.Create(path.Join(destination, ln))
 		if err != nil {
 			log.Fatalf("Failed to create new file. %s", err)
 		}
@@ -115,8 +123,133 @@ func main() {
 			log.Fatalf("Failed to write new file. %s", err)
 		} else {
 			fmt.Printf("Written %s\n", out.Name())
+
+			// Some license files have identical content under different names.
+			// Compare each file with the one written just before it and group the
+			// names by content hash; the groups become the name-conversion map
+			// written out below. initClassifier uses that map so that identical
+			// content is not added again under another license name.
+			sum, err := getSameFileMD5(pf, out)
+			if err != nil {
+				log.Fatalf("Failed to get same file md5. %s", err)
+			}
+			if sum != "" {
+				if !contains(sameFiles[sum], pfn) {
+					sameFiles[sum] = append(sameFiles[sum], pfn)
+				}
+				sameFiles[sum] = append(sameFiles[sum], ln)
+			}
+			pf = out
+			pfn = ln
 		}
 	}
 
+	// generate convert license name map
+	convertLicenseName := make(map[string]string)
+	for _, fileNames := range sameFiles {
+		key := getLicenseKey(fileNames)
+		for _, fileName := range fileNames {
+			convertLicenseName[fileName] = key
+		}
+	}
+	// save convert license name map to file
+	convertJSON, err := json.Marshal(convertLicenseName)
+	if err != nil {
+		log.Fatalf("Failed to create json bytes. %s", err)
+	}
+	// TODO change the path
+	convertFile := "options/convertLicenseName"
+	if err := os.WriteFile(convertFile, convertJSON, 0o644); err != nil {
+		log.Fatalf("Failed to write %s. %s", convertFile, err)
+	}
+
 	fmt.Println("Done")
 }
+
+// getSameFileMD5 compares the contents of f1 and f2. If both are non-nil and
+// hold identical bytes it returns the hex-encoded MD5 digest of that content;
+// otherwise it returns "". Both files are rewound and read from the start, so
+// they must be open for reading.
+func getSameFileMD5(f1, f2 *os.File) (string, error) {
+	if f1 == nil || f2 == nil {
+		return "", nil
+	}
+
+	// Files of different sizes cannot match, so avoid reading them.
+	fs1, err := f1.Stat()
+	if err != nil {
+		return "", err
+	}
+	fs2, err := f2.Stat()
+	if err != nil {
+		return "", err
+	}
+	if fs1.Size() != fs2.Size() {
+		return "", nil
+	}
+
+	// The caller passes files it has just written, so rewind before reading.
+	if _, err := f1.Seek(0, io.SeekStart); err != nil {
+		return "", err
+	}
+	if _, err := f2.Seek(0, io.SeekStart); err != nil {
+		return "", err
+	}
+
+	c1, err := io.ReadAll(f1)
+	if err != nil {
+		return "", err
+	}
+	c2, err := io.ReadAll(f2)
+	if err != nil {
+		return "", err
+	}
+	if !bytes.Equal(c1, c2) {
+		return "", nil
+	}
+	return fmt.Sprintf("%x", md5.Sum(c1)), nil
+}
+
+// getLicenseKey returns the shared name for a group of license files with the
+// same content: the longest dash-separated leading part of the shortest name
+// that every name in fnl starts with. When the names share no such prefix the
+// shortest name is returned instead of an empty key. An empty fnl yields "".
+func getLicenseKey(fnl []string) string {
+	if len(fnl) == 0 {
+		return ""
+	}
+
+	shortest := fnl[0]
+	for _, name := range fnl[1:] {
+		if len(name) < len(shortest) {
+			shortest = name
+		}
+	}
+	allHavePrefix := func(prefix string) bool {
+		for _, name := range fnl {
+			if !strings.HasPrefix(name, prefix) {
+				return false
+			}
+		}
+		return true
+	}
+
+	parts := strings.Split(shortest, "-")
+	for i := len(parts); i > 0; i-- {
+		prefix := strings.Join(parts[:i], "-")
+		if prefix != "" && allHavePrefix(prefix) {
+			return prefix
+		}
+	}
+	return shortest
+}
+
+// contains reports whether e is an element of s.
+func contains(s []string, e string) bool {
+	for _, a := range s {
+		if a == e {
+			return true
+		}
+	}
+	return false
+}
diff --git a/modules/repository/license.go b/modules/repository/license.go
index 160ec28f01..dc9584f39f 100644
--- a/modules/repository/license.go
+++ b/modules/repository/license.go
@@ -7,6 +7,7 @@ import (
 	"bufio"
 	"bytes"
 	"context"
+	"encoding/json"
 	"fmt"
 	"regexp"
 	"strings"
@@ -23,27 +24,63 @@ import (
 var classifier *licenseclassifier.Classifier
 
 func init() {
-	err := InitClassifier()
+	cln, err := getConvertLicenseName()
+	if err != nil {
+		log.Error("getConvertLicenseName: %v", err)
+	}
+	err = initClassifier(cln)
 	if err != nil {
 		log.Error("initClassifier: %v", err)
 	}
 }
 
-func InitClassifier() error {
+// getConvertLicenseName reads the "convertLicenseName" options asset and decodes
+// it as a JSON object that maps a license file name to the name shared by all
+// license files with the same content.
+func getConvertLicenseName() (map[string]string, error) {
+	data, err := options.AssetFS().ReadFile("", "convertLicenseName")
+	if err != nil {
+		return nil, err
+	}
+	var convertLicenseName map[string]string
+	err = json.Unmarshal([]byte(data), &convertLicenseName)
+	if err != nil {
+		return nil, err
+	}
+	return convertLicenseName, nil
+}
+
+// initClassifier replaces the package-level classifier and fills it from the
+// files listed under the "license" options directory. When convertLicenseName
+// is non-nil, a file is added under the variant it maps to, once per variant.
+func initClassifier(convertLicenseName map[string]string) error {
+	// The threshold must stay within 0.84~0.86, otherwise the detection test fails.
 	// TODO: add threshold to app.ini
-	classifier = licenseclassifier.NewClassifier(.9)
+	classifier = licenseclassifier.NewClassifier(.85)
 	licenseFiles, err := options.AssetFS().ListFiles("license", true)
 	if err != nil {
 		return err
 	}
 
+	licenseVariantCount := make(map[string]int)
 	if len(licenseFiles) > 0 {
 		for _, lf := range licenseFiles {
 			data, err := options.License(lf)
 			if err != nil {
 				return err
 			}
-			classifier.AddContent("License", lf, "license", data)
+			variant := lf
+			if convertLicenseName != nil {
+				v, ok := convertLicenseName[lf]
+				if ok {
+					variant = v
+				}
+				licenseVariantCount[variant]++
+				if licenseVariantCount[variant] > 1 {
+					continue
+				}
+			}
+			classifier.AddContent("License", lf, variant, data)
 		}
 	}
 	return nil
@@ -244,21 +281,10 @@ func detectLicense(buf []byte) []string {
 	}
 
 	matches := classifier.Match(buf)
-	licenseVariants := make(map[string][]string, len(matches.Matches))
+	var results []string
 	for _, r := range matches.Matches {
 		if r.MatchType == "License" {
-			tag := fmt.Sprintf("%d-%d", r.StartLine, r.EndLine)
-			licenseVariants[tag] = append(licenseVariants[tag], r.Name)
-		}
-	}
-
-	var results []string
-	for _, licenses := range licenseVariants {
-		if len(licenses) == 1 {
-			results = append(results, licenses[0])
-		} else {
-			// TODO: reslove license detection conflict
-			results = append(results, licenses...)
+			results = append(results, r.Variant)
 		}
 	}
 	return results
diff --git a/modules/repository/license_test.go b/modules/repository/license_test.go
index 753877ec2b..7621d8325f 100644
--- a/modules/repository/license_test.go
+++ b/modules/repository/license_test.go
@@ -202,6 +202,8 @@ func Test_detectLicense(t *testing.T) {
 	}
 
 	LoadRepoConfig()
+	convertLicenseName, err := getConvertLicenseName()
+	assert.NoError(t, err)
 	for _, licenseName := range Licenses {
 		license, err := getLicense(licenseName, &licenseValues{
 			Owner: "Gitea",
@@ -211,20 +213,25 @@ func Test_detectLicense(t *testing.T) {
 		})
 		assert.NoError(t, err)
 
+		variant := licenseName
+		if v, ok := convertLicenseName[licenseName]; ok {
+			variant = v
+		}
 		tests = append(tests, DetectLicenseTest{
 			name: fmt.Sprintf("auto single license test: %s", licenseName),
 			arg:  license,
-			want: []string{licenseName},
+			want: []string{variant},
 		})
 	}
 
 	tests = append(tests, DetectLicenseTest{
 		name: fmt.Sprintf("auto multiple license test: %s and %s", tests[2].want[0], tests[3].want[0]),
 		arg:  append(tests[2].arg, tests[3].arg...),
+		// TODO: make this check independent of the order of the results
 		want: []string{"389-exception", "0BSD"},
 	})
 
-	err := InitClassifier()
+	err = initClassifier(convertLicenseName)
 	assert.NoError(t, err)
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {