generate same file list

This commit is contained in:
yp05327 2023-06-14 02:02:08 +00:00
parent f45d59263c
commit 84c7b98aaa
3 changed files with 210 additions and 23 deletions

View File

@ -4,7 +4,9 @@ package main
import (
"archive/tar"
"bytes"
"compress/gzip"
"crypto/md5"
"flag"
"fmt"
"io"
@ -15,6 +17,7 @@ import (
"path/filepath"
"strings"
"code.gitea.io/gitea/modules/json"
"code.gitea.io/gitea/modules/util"
)
@ -77,7 +80,9 @@ func main() {
}
tr := tar.NewReader(gz)
var pf *os.File
var pfn string
sameFiles := make(map[string][]string)
for {
hdr, err := tr.Next()
@ -97,14 +102,17 @@ func main() {
continue
}
if strings.HasPrefix(filepath.Base(hdr.Name), "README") {
fbn := filepath.Base(hdr.Name)
ln := strings.TrimSuffix(fbn, ".txt")
if strings.HasPrefix(fbn, "README") {
continue
}
if strings.HasPrefix(filepath.Base(hdr.Name), "deprecated_") {
if strings.HasPrefix(fbn, "deprecated_") {
continue
}
out, err := os.Create(path.Join(destination, strings.TrimSuffix(filepath.Base(hdr.Name), ".txt")))
out, err := os.Create(path.Join(destination, ln))
if err != nil {
log.Fatalf("Failed to create new file. %s", err)
}
@ -115,8 +123,158 @@ func main() {
log.Fatalf("Failed to write new file. %s", err)
} else {
fmt.Printf("Written %s\n", out.Name())
// Some license files have identical content, so we detect them and write a name-conversion map to a file.
// InitClassifier uses this map to avoid adding the same license content under different license names.
md5, err := getSameFileMD5(pf, out)
if err != nil {
log.Fatalf("Failed to get same file md5. %s", err)
continue
}
if md5 != "" {
_, ok := sameFiles[md5]
if !ok {
sameFiles[md5] = make([]string, 0)
}
if !contains(sameFiles[md5], pfn) {
sameFiles[md5] = append(sameFiles[md5], pfn)
}
sameFiles[md5] = append(sameFiles[md5], ln)
}
pf = out
pfn = ln
}
}
// generate convert license name map
convertLicenseName := make(map[string]string)
for _, fileNames := range sameFiles {
key := getLicenseKey(fileNames)
for _, fileName := range fileNames {
convertLicenseName[fileName] = key
}
}
// save convert license name map to file
bytes, err := json.Marshal(convertLicenseName)
if err != nil {
log.Fatalf("Failed to create json bytes. %s", err)
return
}
// TODO change the path
path := "options/convertLicenseName"
out, err := os.Create(path)
if err != nil {
log.Fatalf("Failed to create new file. %s", err)
}
defer out.Close()
_, err = out.Write(bytes)
if err != nil {
log.Fatalf("Failed to write %s. %s", path, err)
}
fmt.Println("Done")
}
// getSameFileMD5 compares the contents of f1 and f2 and, when they are
// byte-for-byte identical, returns the MD5 digest of that content as a raw
// 16-byte string (not hex encoded; it is intended to be used as a map key).
//
// It returns "" with a nil error when either file is nil, when the file sizes
// differ, or when the contents differ. A non-nil error is returned only when
// Stat, Seek or a read fails. Both files are rewound to the start before being
// read, and their offsets are left wherever the comparison stopped.
func getSameFileMD5(f1, f2 *os.File) (string, error) {
	if f1 == nil || f2 == nil {
		return "", nil
	}

	// Files of different sizes can never be identical, so skip reading them.
	fs1, err := f1.Stat()
	if err != nil {
		return "", err
	}
	fs2, err := f2.Stat()
	if err != nil {
		return "", err
	}
	if fs1.Size() != fs2.Size() {
		return "", nil
	}

	if _, err := f1.Seek(0, io.SeekStart); err != nil {
		return "", err
	}
	if _, err := f2.Seek(0, io.SeekStart); err != nil {
		return "", err
	}

	const chunkSize = 1024
	b1 := make([]byte, chunkSize)
	b2 := make([]byte, chunkSize)
	digest := md5.New()
	for {
		// io.ReadFull fills the whole buffer unless the end of the file is
		// reached, so a short chunk can only be the final one.
		n1, err1 := io.ReadFull(f1, b1)
		n2, err2 := io.ReadFull(f2, b2)
		end1 := err1 == io.EOF || err1 == io.ErrUnexpectedEOF
		end2 := err2 == io.EOF || err2 == io.ErrUnexpectedEOF
		if err1 != nil && !end1 {
			return "", err1
		}
		if err2 != nil && !end2 {
			return "", err2
		}

		// Compare only the bytes actually read; chunks of different lengths
		// are reported as unequal by bytes.Equal.
		if !bytes.Equal(b1[:n1], b2[:n2]) {
			return "", nil
		}

		// Hash exactly the bytes read so the digest is the MD5 of the file
		// content, without any buffer padding. hash.Hash.Write never fails.
		digest.Write(b1[:n1])

		if end1 || end2 {
			return string(digest.Sum(nil)), nil
		}
	}
}
// getLicenseKey derives a common key for a group of license file names.
// It takes the shortest name in fnl (the first one wins on ties) and drops
// trailing "-"-separated segments from it until what remains is a prefix of
// every name in fnl. It returns "" when fnl is empty or when the names share
// no such prefix.
func getLicenseKey(fnl []string) string {
	if len(fnl) == 0 {
		return ""
	}

	// Locate the shortest name; the strict comparison keeps the earliest one.
	shortest := fnl[0]
	for _, name := range fnl[1:] {
		if len(name) < len(shortest) {
			shortest = name
		}
	}

	// sharedBy reports whether every name starts with prefix.
	sharedBy := func(prefix string) bool {
		for _, name := range fnl {
			if !strings.HasPrefix(name, prefix) {
				return false
			}
		}
		return true
	}

	candidate := shortest
	for !sharedBy(candidate) {
		cut := strings.LastIndex(candidate, "-")
		if cut < 0 {
			// No segment left to drop: only the empty prefix remains.
			return ""
		}
		candidate = candidate[:cut]
	}
	return candidate
}
// contains reports whether the string e occurs in the slice s.
func contains(s []string, e string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] == e {
			return true
		}
	}
	return false
}

View File

@ -7,6 +7,7 @@ import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"regexp"
"strings"
@ -23,27 +24,56 @@ import (
var classifier *licenseclassifier.Classifier
func init() {
err := InitClassifier()
cln, err := getConvertLicenseName()
if err != nil {
log.Error("getConvertLicenseName: %v", err)
}
err = initClassifier(cln)
if err != nil {
log.Error("initClassifier: %v", err)
}
}
func InitClassifier() error {
// getConvertLicenseName reads the "convertLicenseName" asset through
// options.AssetFS and decodes it as a JSON object of string keys to string
// values. It returns a nil map together with the error when the asset cannot
// be read or is not valid JSON.
func getConvertLicenseName() (map[string]string, error) {
	raw, err := options.AssetFS().ReadFile("", "convertLicenseName")
	if err != nil {
		return nil, err
	}

	var mapping map[string]string
	if err := json.Unmarshal([]byte(raw), &mapping); err != nil {
		return nil, err
	}
	return mapping, nil
}
func initClassifier(convertLicenseName map[string]string) error {
// the threshold should be within 0.84~0.86, otherwise the test fails
// TODO: add threshold to app.ini
classifier = licenseclassifier.NewClassifier(.9)
classifier = licenseclassifier.NewClassifier(.85)
licenseFiles, err := options.AssetFS().ListFiles("license", true)
if err != nil {
return err
}
licenseVariantCount := make(map[string]int)
if len(licenseFiles) > 0 {
for _, lf := range licenseFiles {
data, err := options.License(lf)
if err != nil {
return err
}
classifier.AddContent("License", lf, "license", data)
variant := lf
if convertLicenseName != nil {
v, ok := convertLicenseName[lf]
if ok {
variant = v
}
licenseVariantCount[variant]++
if licenseVariantCount[variant] > 1 {
continue
}
}
classifier.AddContent("License", lf, variant, data)
}
}
return nil
@ -244,21 +274,10 @@ func detectLicense(buf []byte) []string {
}
matches := classifier.Match(buf)
licenseVariants := make(map[string][]string, len(matches.Matches))
var results []string
for _, r := range matches.Matches {
if r.MatchType == "License" {
tag := fmt.Sprintf("%d-%d", r.StartLine, r.EndLine)
licenseVariants[tag] = append(licenseVariants[tag], r.Name)
}
}
var results []string
for _, licenses := range licenseVariants {
if len(licenses) == 1 {
results = append(results, licenses[0])
} else {
// TODO: reslove license detection conflict
results = append(results, licenses...)
results = append(results, r.Variant)
}
}
return results

View File

@ -202,6 +202,8 @@ func Test_detectLicense(t *testing.T) {
}
LoadRepoConfig()
convertLicenseName, err := getConvertLicenseName()
assert.NoError(t, err)
for _, licenseName := range Licenses {
license, err := getLicense(licenseName, &licenseValues{
Owner: "Gitea",
@ -211,20 +213,28 @@ func Test_detectLicense(t *testing.T) {
})
assert.NoError(t, err)
variant := licenseName
if convertLicenseName != nil {
v, ok := convertLicenseName[licenseName]
if ok {
variant = v
}
}
tests = append(tests, DetectLicenseTest{
name: fmt.Sprintf("auto single license test: %s", licenseName),
arg: license,
want: []string{licenseName},
want: []string{variant},
})
}
tests = append(tests, DetectLicenseTest{
name: fmt.Sprintf("auto multiple license test: %s and %s", tests[2].want[0], tests[3].want[0]),
arg: append(tests[2].arg, tests[3].arg...),
// TODO doesn't depend on the order
want: []string{"389-exception", "0BSD"},
})
err := InitClassifier()
err = initClassifier(convertLicenseName)
assert.NoError(t, err)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {