mirror of
https://github.com/go-gitea/gitea
synced 2024-10-19 20:10:10 +02:00
generate same file list
This commit is contained in:
parent
f45d59263c
commit
84c7b98aaa
@ -4,7 +4,9 @@ package main
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"crypto/md5"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@ -15,6 +17,7 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"code.gitea.io/gitea/modules/json"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
)
|
||||
|
||||
@ -77,7 +80,9 @@ func main() {
|
||||
}
|
||||
|
||||
tr := tar.NewReader(gz)
|
||||
|
||||
var pf *os.File
|
||||
var pfn string
|
||||
sameFiles := make(map[string][]string)
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
|
||||
@ -97,14 +102,17 @@ func main() {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasPrefix(filepath.Base(hdr.Name), "README") {
|
||||
fbn := filepath.Base(hdr.Name)
|
||||
ln := strings.TrimSuffix(fbn, ".txt")
|
||||
|
||||
if strings.HasPrefix(fbn, "README") {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasPrefix(filepath.Base(hdr.Name), "deprecated_") {
|
||||
if strings.HasPrefix(fbn, "deprecated_") {
|
||||
continue
|
||||
}
|
||||
out, err := os.Create(path.Join(destination, strings.TrimSuffix(filepath.Base(hdr.Name), ".txt")))
|
||||
out, err := os.Create(path.Join(destination, ln))
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create new file. %s", err)
|
||||
}
|
||||
@ -115,8 +123,158 @@ func main() {
|
||||
log.Fatalf("Failed to write new file. %s", err)
|
||||
} else {
|
||||
fmt.Printf("Written %s\n", out.Name())
|
||||
|
||||
// some license files have same content, so we need to detect these files and create a convert map into a file
|
||||
// In InitClassifier, we will use this convert map to avoid adding same license content with different license name
|
||||
md5, err := getSameFileMD5(pf, out)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to get same file md5. %s", err)
|
||||
continue
|
||||
}
|
||||
if md5 != "" {
|
||||
_, ok := sameFiles[md5]
|
||||
if !ok {
|
||||
sameFiles[md5] = make([]string, 0)
|
||||
}
|
||||
if !contains(sameFiles[md5], pfn) {
|
||||
sameFiles[md5] = append(sameFiles[md5], pfn)
|
||||
}
|
||||
sameFiles[md5] = append(sameFiles[md5], ln)
|
||||
}
|
||||
pf = out
|
||||
pfn = ln
|
||||
}
|
||||
}
|
||||
|
||||
// generate convert license name map
|
||||
convertLicenseName := make(map[string]string)
|
||||
for _, fileNames := range sameFiles {
|
||||
key := getLicenseKey(fileNames)
|
||||
for _, fileName := range fileNames {
|
||||
convertLicenseName[fileName] = key
|
||||
}
|
||||
}
|
||||
// save convert license name map to file
|
||||
bytes, err := json.Marshal(convertLicenseName)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create json bytes. %s", err)
|
||||
return
|
||||
}
|
||||
// TODO change the path
|
||||
path := "options/convertLicenseName"
|
||||
out, err := os.Create(path)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create new file. %s", err)
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = out.Write(bytes)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to write %s. %s", path, err)
|
||||
}
|
||||
|
||||
fmt.Println("Done")
|
||||
}
|
||||
|
||||
// getSameFileMD5 compares the contents of f1 and f2 and, when they are
// byte-for-byte identical, returns the MD5 digest of that content as a raw
// 16-byte string (not hex-encoded; it is meant to be used as a map key).
//
// It returns "" with a nil error when either file is nil, or when the files
// differ in size or content. A non-nil error is returned only when a Stat,
// Seek or Read call fails. Both files are rewound to their start before
// reading and are left positioned wherever reading stopped.
func getSameFileMD5(f1, f2 *os.File) (string, error) {
	if f1 == nil || f2 == nil {
		return "", nil
	}

	// Cheap rejection first: files of different size cannot be identical.
	fs1, err := f1.Stat()
	if err != nil {
		return "", err
	}
	fs2, err := f2.Stat()
	if err != nil {
		return "", err
	}
	if fs1.Size() != fs2.Size() {
		return "", nil
	}

	if _, err = f1.Seek(0, io.SeekStart); err != nil {
		return "", err
	}
	if _, err = f2.Seek(0, io.SeekStart); err != nil {
		return "", err
	}

	const chunkSize = 1024
	buf1 := make([]byte, chunkSize)
	buf2 := make([]byte, chunkSize)
	hasher := md5.New()

	for {
		// io.ReadFull only returns fewer than chunkSize bytes at end of file,
		// so both sides are always compared over equally sized chunks and a
		// short read can never be mistaken for a content difference.
		n1, err1 := io.ReadFull(f1, buf1)
		if err1 != nil && err1 != io.EOF && err1 != io.ErrUnexpectedEOF {
			return "", err1
		}
		n2, err2 := io.ReadFull(f2, buf2)
		if err2 != nil && err2 != io.EOF && err2 != io.ErrUnexpectedEOF {
			return "", err2
		}

		if n1 != n2 || !bytes.Equal(buf1[:n1], buf2[:n2]) {
			return "", nil
		}

		// Hash only the bytes that were actually read, never buffer padding.
		hasher.Write(buf1[:n1]) // hash.Hash.Write never returns an error

		// With n1 == n2, a non-nil err1 means both files reached their end.
		if err1 != nil {
			return string(hasher.Sum(nil)), nil
		}
	}
}
|
||||
|
||||
// getLicenseKey derives a common key for a group of license file names.
// Starting from the shortest name in fnl (the first one wins on ties), it
// repeatedly drops the trailing "-"-separated segment until the remainder is
// a string prefix of every name in fnl, and returns that remainder.
// It returns "" when fnl is empty or when the names share no such prefix.
func getLicenseKey(fnl []string) string {
	if len(fnl) == 0 {
		return ""
	}

	// Pick the shortest name as the initial candidate.
	candidate := fnl[0]
	for _, name := range fnl[1:] {
		if len(name) < len(candidate) {
			candidate = name
		}
	}

	for {
		shared := true
		for _, name := range fnl {
			if !strings.HasPrefix(name, candidate) {
				shared = false
				break
			}
		}
		if shared {
			return candidate
		}

		// Drop the last "-" segment; with none left only the empty key remains.
		cut := strings.LastIndex(candidate, "-")
		if cut < 0 {
			return ""
		}
		candidate = candidate[:cut]
	}
}
|
||||
|
||||
// contains reports whether e is equal to any element of s.
// A nil or empty slice never contains anything.
func contains(s []string, e string) bool {
	for idx := 0; idx < len(s); idx++ {
		if s[idx] == e {
			return true
		}
	}
	return false
}
|
||||
|
@ -7,6 +7,7 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
@ -23,27 +24,56 @@ import (
|
||||
var classifier *licenseclassifier.Classifier
|
||||
|
||||
func init() {
|
||||
err := InitClassifier()
|
||||
cln, err := getConvertLicenseName()
|
||||
if err != nil {
|
||||
log.Error("getConvertLicenseName: %v", err)
|
||||
}
|
||||
err = initClassifier(cln)
|
||||
if err != nil {
|
||||
log.Error("initClassifier: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func InitClassifier() error {
|
||||
func getConvertLicenseName() (map[string]string, error) {
|
||||
data, err := options.AssetFS().ReadFile("", "convertLicenseName")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var convertLicenseName map[string]string
|
||||
err = json.Unmarshal([]byte(data), &convertLicenseName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return convertLicenseName, nil
|
||||
}
|
||||
func initClassifier(convertLicenseName map[string]string) error {
|
||||
// threshold should be in the range 0.84~0.86, otherwise the test will fail
|
||||
// TODO: add threshold to app.ini
|
||||
classifier = licenseclassifier.NewClassifier(.9)
|
||||
classifier = licenseclassifier.NewClassifier(.85)
|
||||
licenseFiles, err := options.AssetFS().ListFiles("license", true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
licenseVariantCount := make(map[string]int)
|
||||
if len(licenseFiles) > 0 {
|
||||
for _, lf := range licenseFiles {
|
||||
data, err := options.License(lf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
classifier.AddContent("License", lf, "license", data)
|
||||
variant := lf
|
||||
if convertLicenseName != nil {
|
||||
v, ok := convertLicenseName[lf]
|
||||
if ok {
|
||||
variant = v
|
||||
}
|
||||
licenseVariantCount[variant]++
|
||||
if licenseVariantCount[variant] > 1 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
classifier.AddContent("License", lf, variant, data)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
@ -244,21 +274,10 @@ func detectLicense(buf []byte) []string {
|
||||
}
|
||||
|
||||
matches := classifier.Match(buf)
|
||||
licenseVariants := make(map[string][]string, len(matches.Matches))
|
||||
var results []string
|
||||
for _, r := range matches.Matches {
|
||||
if r.MatchType == "License" {
|
||||
tag := fmt.Sprintf("%d-%d", r.StartLine, r.EndLine)
|
||||
licenseVariants[tag] = append(licenseVariants[tag], r.Name)
|
||||
}
|
||||
}
|
||||
|
||||
var results []string
|
||||
for _, licenses := range licenseVariants {
|
||||
if len(licenses) == 1 {
|
||||
results = append(results, licenses[0])
|
||||
} else {
|
||||
// TODO: resolve license detection conflict
|
||||
results = append(results, licenses...)
|
||||
results = append(results, r.Variant)
|
||||
}
|
||||
}
|
||||
return results
|
||||
|
@ -202,6 +202,8 @@ func Test_detectLicense(t *testing.T) {
|
||||
}
|
||||
|
||||
LoadRepoConfig()
|
||||
convertLicenseName, err := getConvertLicenseName()
|
||||
assert.NoError(t, err)
|
||||
for _, licenseName := range Licenses {
|
||||
license, err := getLicense(licenseName, &licenseValues{
|
||||
Owner: "Gitea",
|
||||
@ -211,20 +213,28 @@ func Test_detectLicense(t *testing.T) {
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
variant := licenseName
|
||||
if convertLicenseName != nil {
|
||||
v, ok := convertLicenseName[licenseName]
|
||||
if ok {
|
||||
variant = v
|
||||
}
|
||||
}
|
||||
tests = append(tests, DetectLicenseTest{
|
||||
name: fmt.Sprintf("auto single license test: %s", licenseName),
|
||||
arg: license,
|
||||
want: []string{licenseName},
|
||||
want: []string{variant},
|
||||
})
|
||||
}
|
||||
|
||||
tests = append(tests, DetectLicenseTest{
|
||||
name: fmt.Sprintf("auto multiple license test: %s and %s", tests[2].want[0], tests[3].want[0]),
|
||||
arg: append(tests[2].arg, tests[3].arg...),
|
||||
// TODO doesn't depend on the order
|
||||
want: []string{"389-exception", "0BSD"},
|
||||
})
|
||||
|
||||
err := InitClassifier()
|
||||
err = initClassifier(convertLicenseName)
|
||||
assert.NoError(t, err)
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
|
Loading…
Reference in New Issue
Block a user