Skip to content
2 changes: 1 addition & 1 deletion cmd/license-detector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func process(arg string) ([]match, error) {
return nil, err
}

ls, err := licensedb.Detect(resolvedFiler)
ls, _, err := licensedb.Detect(resolvedFiler)
if err != nil {
return nil, err
}
Expand Down
2 changes: 1 addition & 1 deletion licensedb/dataset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func TestDataset(t *testing.T) {
for _, project := range projects {
go func(project filer.File) {
defer wg.Done()
myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name))
myLicenses, _, _ := Detect(filer.NestFiler(rootFiler, project.Name))
if len(myLicenses) > 0 {
mutex.Lock()
licenses[project.Name] = myLicenses
Expand Down
25 changes: 25 additions & 0 deletions licensedb/internal/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,3 +460,28 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
}
return weight
}

// QuerySourceFile looks for licenses in text, which is expected to be the
// comment text extracted from a source file, and returns the matched license
// names mapped to their scores.
//
// The scores are those reported by db.QueryLicenseText; afterwards
// db.addURLMatches is handed the same text together with the result map so it
// can contribute further entries (presumably matches on license URLs - see
// addURLMatches for the exact rules). The returned map is never nil.
func (db *database) QuerySourceFile(text string) map[string]float32 {
	candidates := map[string]float32{}
	// merge folds others into candidates, keeping the highest score per
	// license. It is deliberately not called "append" so that the predeclared
	// builtin is not shadowed inside this method.
	merge := func(others map[string]float32) {
		for key, val := range others {
			if candidates[key] < val {
				candidates[key] = val
			}
		}
	}
	merge(db.QueryLicenseText(text))
	// Planned fallback, disabled until investigateSourceFile is implemented:
	// if len(candidates) == 0 {
	// 	merge(investigateSourceFile(text, db.nameSubstrings, db.nameSubstringSizes))
	// 	if len(candidates) == 0 {
	// 		merge(investigateSourceFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes))
	// 	}
	// }
	if db.debug {
		for key, val := range candidates {
			println("NLP", key, val)
		}
	}
	db.addURLMatches(candidates, text)
	return candidates
}
114 changes: 114 additions & 0 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
"gopkg.in/src-d/enry.v1"
)

var (
Expand Down Expand Up @@ -62,6 +63,36 @@ var (

licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
"^(%s)$", strings.Join(licenseFileNames, "|")))

// commentSyntaxesRe maps a programming language name to the regular
// expression that matches the comments of that language.
//
// ExtractHeaderComments looks languages up by exact key, using the name
// reported by enry.GetLanguage, so every key must be spelled exactly like the
// corresponding enry/linguist language name ("Haskell", not "Haskel").
// NOTE(review): confirm the remaining keys (e.g. "Matlab") against the enry
// version that is vendored.
//
// Limitations visible in the patterns: the Rust, Swift and Scala entries only
// recognise /* ... */ block comments, not // line comments.
commentSyntaxesRe = map[string]*regexp.Regexp{
	"ANTLR":       regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"C":           regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"C++":         regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"C#":          regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"CSS":         regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
	"Go":          regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"HTML":        regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
	"Haskell":     regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
	"Java":        regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"JavaScript":  regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"Matlab":      regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
	"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"Perl":        regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
	"PHP":         regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"Python":      regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
	"Ruby":        regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
	"Rust":        regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
	"R":           regexp.MustCompile(`#.*\t?\r?\n?`),
	"Shell":       regexp.MustCompile(`#.*\t?\r?\n?`),
	"Swift":       regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"SAS":         regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"Scala":       regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"SQL":         regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"TypeScript":  regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
	"YAML":        regexp.MustCompile(`#.*\t?\r?\n?`),
}

// cleanCommentsRe matches the comment markers that ExtractHeaderComments
// removes from every matched comment: the characters '#', '*' and '/', and the
// words "=begin", "=cut" and "=end".
// The characters are removed wherever they occur, so a '/' or '#' inside the
// comment text itself (for example in a URL) is dropped as well. Other markers
// matched by commentSyntaxesRe ("--", "%", "<!--", "{-", triple quotes) are
// left in place.
cleanCommentsRe = regexp.MustCompile(`#|\*|\/|=begin|=cut|=end`)
)

// ExtractLicenseFiles returns the list of possible license texts.
Expand Down Expand Up @@ -157,3 +188,86 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
func IsLicenseDirectory(fileName string) bool {
	// licenseDirectoryRe is anchored on both ends, so the lower-cased name has
	// to equal one of the known license file names as a whole.
	lowered := strings.ToLower(fileName)
	return licenseDirectoryRe.MatchString(lowered)
}

// ExtractSourceFiles reads the given source code files and returns the
// comments found at the top of each of them, when available, together with
// the names of the files those comments were taken from (the two returned
// slices are parallel).
//
// enry is used to detect the language of every file. Files that cannot be read
// are skipped silently: the scan is best effort.
func ExtractSourceFiles(files []string, fs filer.Filer) ([][]byte, []string) {
	// ExtractHeaderComments only inspects the first 1024 bytes of a file, so
	// nothing beyond that needs to stay in memory while the rest of the tree
	// is being read. Keep this in sync with the limit used there.
	const headerSize = 1024

	candidates := [][]byte{}
	fileNames := []string{}
	langs := []string{}
	commentsFileName := []string{}
	for _, file := range files {
		text, err := fs.ReadFile(file)
		if err != nil {
			continue
		}
		// Language detection still gets the complete contents.
		lang := enry.GetLanguage(file, text)
		header := text
		if len(header) > headerSize {
			// Copy instead of re-slicing: a sub-slice would keep the whole
			// file contents reachable through its backing array.
			header = append([]byte(nil), text[:headerSize]...)
		}
		langs = append(langs, lang)
		candidates = append(candidates, header)
		fileNames = append(fileNames, file)
	}
	if len(candidates) > 0 {
		candidates, commentsFileName = ExtractHeaderComments(candidates, langs, fileNames)
	}
	return candidates, commentsFileName
}

// ExtractHeaderComments searches the beginning of source code files for
// comments and returns their text with the comment markers removed.
//
// candidates, langs and fileNames are parallel slices: langs[i] is the
// language name and fileNames[i] the file name of candidates[i], so langs and
// fileNames must be at least as long as candidates. Only the first 1024 bytes
// of every candidate are scanned, using the pattern registered for its
// language in commentSyntaxesRe.
//
// The first result holds one entry per file in which at least one comment was
// found (all matches concatenated, cleaned with cleanCommentsRe); the second
// result holds the matching file names at the same indexes. Languages without
// a registered comment syntax are reported once each on standard output.
func ExtractHeaderComments(candidates [][]byte, langs []string, fileNames []string) ([][]byte, []string) {
	comments := [][]byte{}
	commentsFileName := []string{}
	// Unsupported languages in order of first appearance. The set gives an
	// exact membership test: language names are neither valid regular
	// expressions ("Objective-C++") nor safe to compare by substring ("Text"
	// is contained in "reStructuredText").
	var unsupported []string
	reported := map[string]struct{}{}
	for i, candidate := range candidates {
		lang := langs[i]
		re, supported := commentSyntaxesRe[lang]
		if !supported {
			// An empty name means the language was not detected at all;
			// there is nothing meaningful to report for it.
			if _, dup := reported[lang]; lang != "" && !dup {
				reported[lang] = struct{}{}
				unsupported = append(unsupported, lang)
			}
			continue
		}
		header := candidate
		if len(header) > 1024 {
			header = header[:1024]
		}
		matches := re.FindAllString(string(header), -1)
		if matches == nil {
			continue
		}
		cleaned := make([]string, 0, len(matches))
		for _, m := range matches {
			cleaned = append(cleaned, cleanCommentsRe.ReplaceAllString(m, ""))
		}
		commentsFileName = append(commentsFileName, fileNames[i])
		comments = append(comments, []byte(strings.Join(cleaned, "")))
	}
	if len(unsupported) > 0 {
		fmt.Println("The following file types were not investigated for licenses on the comments:", strings.Join(unsupported, ", ")+". ")
	}
	return comments, commentsFileName
}

// InvestigateHeaderComments runs InvestigateHeaderComment over every header
// comment in texts and merges the outcome.
//
// commentsFileName is parallel to texts: commentsFileName[i] names the file
// texts[i] was extracted from. The fs argument is currently not used.
//
// The returned map holds, for every license found in any of the comments, the
// highest score it reached; the returned slice lists the files in which at
// least one license was found.
func InvestigateHeaderComments(texts [][]byte, fs filer.Filer, commentsFileName []string) (map[string]float32, []string) {
	best := map[string]float32{}
	matchedFiles := []string{}
	// TODO: output max license per file, not files with licenses + licenses found
	for idx := range texts {
		found := InvestigateHeaderComment(texts[idx])
		if len(found) == 0 {
			continue
		}
		matchedFiles = append(matchedFiles, commentsFileName[idx])
		for license, score := range found {
			if previous := best[license]; score > previous {
				best[license] = score
			}
		}
	}
	return best, matchedFiles
}

// InvestigateHeaderComment looks up the text of a single header comment in the
// global license database (via QuerySourceFile) and returns the probable
// license names mapped to their scores.
func InvestigateHeaderComment(text []byte) map[string]float32 {
	db := globalLicenseDatabase()
	comment := string(text)
	return db.QuerySourceFile(comment)
}
11 changes: 11 additions & 0 deletions licensedb/internal/nlp.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,14 @@ func splitLicenseName(name string) []substring {
})
return result
}

// investigateSourceFile is a placeholder for name-based license detection in
// source file comments. It does not look at any of its arguments yet and
// always returns an empty, non-nil map.
func investigateSourceFile(
	text string, licenseNameParts map[string][]substring,
	licenseNameSizes map[string]int) map[string]float32 {
	// TODO: split license-comments from description-comments. Candidate
	// separators:
	//   =====
	//   ----
	//   \n\n\n
	//   import
	noMatches := make(map[string]float32)
	return noMatches
}
43 changes: 35 additions & 8 deletions licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ var (

// Detect returns the most probable reference licenses matched for the given
// file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
func Detect(fs filer.Filer) (map[string]float32, error) {
func Detect(fs filer.Filer) (map[string]float32, []string, error) {
files, err := fs.ReadDir("")
if err != nil {
return nil, err
return nil, nil, err
}
fileNames := []string{}
for _, file := range files {
Expand All @@ -39,16 +39,43 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
candidates := internal.ExtractLicenseFiles(fileNames, fs)
licenses := internal.InvestigateLicenseTexts(candidates)
if len(licenses) > 0 {
return licenses, nil
return licenses, nil, nil
}
// Plan B: take the README, find the section about the license and apply NER
candidates = internal.ExtractReadmeFiles(fileNames, fs)
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
if len(candidates) > 0 {
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) > 0 {
return licenses, nil, nil
}
}

// Plan C: look for licence texts in source code files with comments at header
extendedFileNames := []string{}
commentsFileName := []string{}
licensesFileNames := []string{}
extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
candidates, commentsFileName = internal.ExtractSourceFiles(extendedFileNames, fs)
if len(candidates) > 0 {
licenses, licensesFileNames = internal.InvestigateHeaderComments(candidates, fs, commentsFileName)
}
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
return nil, nil, ErrNoLicenseFound
}
return licenses, licensesFileNames, nil
}

// extractAllSubfiles walks the tree below path recursively and appends the
// path of every regular file it finds to fileNames, returning the extended
// slice. Paths are built by joining path with the entry names, so they are
// relative to the root of fs when the walk starts at "".
//
// Directories that cannot be read are skipped silently: the walk is best
// effort and returns whatever could be collected.
func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
	files, err := fs.ReadDir(path)
	if err != nil {
		return fileNames
	}
	for _, subfile := range files {
		currentPath := paths.Join(path, subfile.Name)
		if subfile.IsDir {
			fileNames = extractAllSubfiles(fs, fileNames, currentPath)
			continue
		}
		fileNames = append(fileNames, currentPath)
	}
	return fileNames
}