src-d · merybenavente · Jul 17, 2018 · Jul 18, 2018 · Jul 20, 2018 · Jul 20, 2018
diff --git a/cmd/license-detector/main.go b/cmd/license-detector/main.go
@@ -103,7 +103,7 @@ func process(arg string) ([]match, error) {
 		return nil, err
 	}
 
-	ls, err := licensedb.Detect(resolvedFiler)
+	ls, _, err := licensedb.Detect(resolvedFiler)
 	if err != nil {
 		return nil, err
 	}

diff --git a/licensedb/dataset_test.go b/licensedb/dataset_test.go
@@ -23,7 +23,7 @@ func TestDataset(t *testing.T) {
 	for _, project := range projects {
 		go func(project filer.File) {
 			defer wg.Done()
-			myLicenses, _ := Detect(filer.NestFiler(rootFiler, project.Name))
+			myLicenses, _, _ := Detect(filer.NestFiler(rootFiler, project.Name))
 			if len(myLicenses) > 0 {
 				mutex.Lock()
 				licenses[project.Name] = myLicenses

diff --git a/licensedb/internal/db.go b/licensedb/internal/db.go
@@ -460,3 +460,28 @@ func tfidf(freq int, docfreq int, ndocs int) float32 {
 	}
 	return weight
 }
+
+// QuerySourceFile returns the license candidates found in the given source file comment
+// text, mapping each license name to the highest score reported for it.
+func (db *database) QuerySourceFile(text string) map[string]float32 {
+	candidates := map[string]float32{}
+	// merge keeps, for every key, the largest value seen so far.
+	merge := func(others map[string]float32) {
+		for key, val := range others {
+			if candidates[key] < val {
+				candidates[key] = val
+			}
+		}
+	}
+	merge(db.QueryLicenseText(text))
+	// TODO: when nothing matched, fall back to investigateSourceFile with
+	// db.nameSubstrings, then with db.nameShortSubstrings.
+	if db.debug {
+		for key, val := range candidates {
+			println("NLP", key, val)
+		}
+	}
+	// addURLMatches receives the candidates map after the debug dump above.
+	db.addURLMatches(candidates, text)
+	return candidates
+}
diff --git a/licensedb/internal/investigation.go b/licensedb/internal/investigation.go
@@ -10,6 +10,7 @@ import (
 
 	"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
 	"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
+	"gopkg.in/src-d/enry.v1"
 )
 
 var (
@@ -62,6 +63,36 @@ var (
 
 	licenseDirectoryRe = regexp.MustCompile(fmt.Sprintf(
 		"^(%s)$", strings.Join(licenseFileNames, "|")))
+
+	commentSyntaxesRe = map[string]*regexp.Regexp{ // language name as returned by enry.GetLanguage -> pattern matching that language's comments
+		"ANTLR": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C++": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"C#": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"CSS": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
+		"Go": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"HTML": regexp.MustCompile(`<\!--(.*?\t?\r?\n?)+?-->`),
+		"Haskell": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\{-(.*?\t?\r?\n?)+?\-\})`),
+		"Java": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"JavaScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Matlab": regexp.MustCompile(`(%.*\t?\r?\n?)|(%\{(.?\t?\r?\n?)+?%\})`),
+		"Objective-C": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Perl": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=cut)`),
+		"PHP": regexp.MustCompile(`(#.*\t?\r?\n?)|(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Python": regexp.MustCompile("('''(.?\t?\r?\n?)+?''')|(#.*\t?\r?\n?)|(\"\"\"(.?\t?\r?\n?)+?\"\"\")"),
+		"Ruby": regexp.MustCompile(`(#.*\t?\r?\n?)|(=begin(.*?\t?\r?\n?)+?=end)`),
+		"Rust": regexp.MustCompile(`\/\*(.*?\t?\r?\n?)+?\*\/`),
+		"R": regexp.MustCompile(`#.*\t?\r?\n?`),
+		"Shell": regexp.MustCompile(`#.*\t?\r?\n?`),
+		"Swift": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"SAS": regexp.MustCompile(`(\*(.*?\t?\r?\n?)+?;)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"Scala": regexp.MustCompile(`(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"SQL": regexp.MustCompile(`(-{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"TypeScript": regexp.MustCompile(`(\/{2}.*\t?\r?\n?)|(\/\*(.*?\t?\r?\n?)+?\*\/)`),
+		"YAML": regexp.MustCompile(`#.*\t?\r?\n?`),
+	}
+
+	cleanCommentsRe = regexp.MustCompile(`#|\*|\/|=begin|=cut|=end`) // removed from matched comments; NOTE(review): also strips every '/', '*' and '#' inside the text, e.g. in URLs
 )
 
 // ExtractLicenseFiles returns the list of possible license texts.
@@ -157,3 +188,86 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
 func IsLicenseDirectory(fileName string) bool {
 	return licenseDirectoryRe.MatchString(strings.ToLower(fileName))
 }
+
+// ExtractSourceFiles reads each listed file, detects its language with enry and returns the
+// header comments found, together with the names of the files they were found in.
+func ExtractSourceFiles(files []string, fs filer.Filer) ([][]byte, []string) {
+	contents := [][]byte{}
+	names := []string{}
+	detected := []string{}
+	for _, name := range files {
+		content, err := fs.ReadFile(name)
+		if err != nil {
+			continue // files that cannot be read are skipped
+		}
+		contents = append(contents, content)
+		names = append(names, name)
+		detected = append(detected, enry.GetLanguage(name, content))
+	}
+	// Nothing readable: return empty (non-nil) slices without scanning for comments.
+	if len(contents) == 0 {
+		return contents, []string{}
+	}
+	return ExtractHeaderComments(contents, detected, names)
+}
+
+// ExtractHeaderComments collects the comments found in the first 1024 bytes of each candidate whose
+// language has a known comment syntax; other languages are listed once each on standard output.
+func ExtractHeaderComments(candidates [][]byte, langs []string, fileNames []string) ([][]byte, []string) {
+	comments := [][]byte{}
+	commentsFileName := []string{}
+	unsupported := []string{}
+	seen := map[string]bool{"": true} // the empty language name is never reported, as before
+	for i, candidate := range candidates {
+		reg, exists := commentSyntaxesRe[langs[i]]
+		if !exists {
+			if !seen[langs[i]] { // exact-name check: language names are not regular expressions
+				seen[langs[i]] = true
+				unsupported = append(unsupported, langs[i])
+			}
+			continue
+		}
+		if len(candidate) > 1024 {
+			candidate = candidate[:1024] // only the head of the file is inspected
+		}
+		if match := reg.FindAllString(string(candidate), -1); match != nil {
+			commentsFileName = append(commentsFileName, fileNames[i])
+			matchText := []byte{}
+			for _, m := range match {
+				matchText = append(matchText, cleanCommentsRe.ReplaceAllString(m, "")...)
+			}
+			comments = append(comments, matchText)
+		}
+	}
+	if len(unsupported) > 0 {
+		fmt.Println("The following file types were not investigated for licenses on the comments:", strings.Join(unsupported, ", ")+". ")
+	}
+	return comments, commentsFileName
+}
+
+// InvestigateHeaderComments runs InvestigateHeaderComment on every text and returns the highest
+// similarity seen per license name, plus the names of the files that produced any candidate.
+func InvestigateHeaderComments(texts [][]byte, fs filer.Filer, commentsFileName []string) (map[string]float32, []string) {
+	best := map[string]float32{}
+	matchedFiles := []string{}
+	// TO DO: output max license per file, not files with licenses + licenses found
+	for idx := range texts {
+		found := InvestigateHeaderComment(texts[idx])
+		if len(found) == 0 {
+			continue
+		}
+		matchedFiles = append(matchedFiles, commentsFileName[idx])
+		for license, score := range found {
+			if best[license] < score {
+				best[license] = score
+			}
+		}
+	}
+	return best, matchedFiles
+}
+
+// InvestigateHeaderComment queries the global license database with one header comment's text.
+func InvestigateHeaderComment(text []byte) map[string]float32 {
+	db := globalLicenseDatabase()
+	return db.QuerySourceFile(string(text))
+}
diff --git a/licensedb/internal/nlp.go b/licensedb/internal/nlp.go
@@ -143,3 +143,14 @@ func splitLicenseName(name string) []substring {
 	})
 	return result
 }
+
+// investigateSourceFile is a placeholder that ignores its arguments and returns an empty map.
+func investigateSourceFile(
+	text string, licenseNameParts map[string][]substring,
+	licenseNameSizes map[string]int) map[string]float32 {
+	// TO DO: split license-comments from description-comments; candidate separators:
+	//   ===== and ----
+	//   \n\n\n
+	//   import
+	return map[string]float32{}
+}
diff --git a/licensedb/licensedb.go b/licensedb/licensedb.go
@@ -15,10 +15,10 @@ var (
 
 // Detect returns the most probable reference licenses matched for the given
 // file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
-func Detect(fs filer.Filer) (map[string]float32, error) {
+func Detect(fs filer.Filer) (map[string]float32, []string, error) {
 	files, err := fs.ReadDir("")
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	fileNames := []string{}
 	for _, file := range files {
@@ -39,16 +39,43 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
 	candidates := internal.ExtractLicenseFiles(fileNames, fs)
 	licenses := internal.InvestigateLicenseTexts(candidates)
 	if len(licenses) > 0 {
-		return licenses, nil
+		return licenses, nil, nil
 	}
 	// Plan B: take the README, find the section about the license and apply NER
 	candidates = internal.ExtractReadmeFiles(fileNames, fs)
-	if len(candidates) == 0 {
-		return nil, ErrNoLicenseFound
+	if len(candidates) > 0 {
+		licenses = internal.InvestigateReadmeTexts(candidates, fs)
+		if len(licenses) > 0 {
+			return licenses, nil, nil
+		}
+	}
+
+	// Plan C: look for licence texts in source code files with comments at header
+	extendedFileNames := []string{}
+	commentsFileName := []string{}
+	licensesFileNames := []string{}
+	extendedFileNames = extractAllSubfiles(fs, extendedFileNames, "")
+	candidates, commentsFileName = internal.ExtractSourceFiles(extendedFileNames, fs)
+	if len(candidates) > 0 {
+		licenses, licensesFileNames = internal.InvestigateHeaderComments(candidates, fs, commentsFileName)
 	}
-	licenses = internal.InvestigateReadmeTexts(candidates, fs)
 	if len(licenses) == 0 {
-		return nil, ErrNoLicenseFound
+		return nil, nil, ErrNoLicenseFound
+	}
+	return licenses, licensesFileNames, nil
+}
+
+// extractAllSubfiles appends to fileNames the path of every non-directory entry found under
+// path, descending into sub-directories; a directory that cannot be read adds nothing.
+func extractAllSubfiles(fs filer.Filer, fileNames []string, path string) []string {
+	if entries, err := fs.ReadDir(path); err == nil {
+		for _, entry := range entries {
+			if full := paths.Join(path, entry.Name); entry.IsDir {
+				fileNames = extractAllSubfiles(fs, fileNames, full)
+			} else {
+				fileNames = append(fileNames, full)
+			}
+		}
 	}
-	return licenses, nil
+	return fileNames
 }