-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessText.go
More file actions
77 lines (63 loc) · 1.75 KB
/
processText.go
File metadata and controls
77 lines (63 loc) · 1.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package main
import (
"bufio"
"os"
"path/filepath"
"regexp"
"strings"
)
// preprocessText walks folderPath recursively, reads every non-directory entry
// whose extension is exactly ".txt", and returns the in-vocabulary words in the
// order they were encountered.
//
// Each line is normalized by deleting every character matched by `[^\w\s]`
// (anything that is not an ASCII word character or ASCII whitespace),
// lower-casing what remains, and splitting on whitespace. Tokens that are not
// present in w2v.Vocab are dropped. Every kept token increments
// w2v.WordFrequency while w2v.M is held, and is appended to the result.
//
// Any error reported while walking the tree, opening a file, or scanning it
// aborts the walk and is returned together with a nil slice. Frequencies
// counted before the failure are not rolled back.
func (w2v *Word2Vec) preprocessText(folderPath string) (allWords []string, err error) {
	// Upper bound for a single line. bufio.Scanner defaults to 64 KiB and
	// fails with bufio.ErrTooLong beyond that, which breaks on corpora stored
	// as one very long line. The buffer only grows as far as a line requires.
	const maxLineBytes = 1 << 30

	// Set of vocabulary words for constant-time membership checks.
	vocabSet := make(map[string]struct{}, len(w2v.Vocab))
	for _, word := range w2v.Vocab {
		vocabSet[word] = struct{}{}
	}

	// Compiled once per call instead of once per visited file.
	punctuation := regexp.MustCompile(`[^\w\s]`)

	err = filepath.Walk(folderPath, func(filePath string, info os.FileInfo, walkErr error) error {
		// Surface traversal failures (missing root, unreadable directory)
		// instead of silently reporting an empty corpus. info may be nil
		// when walkErr is set, so this check must come first.
		if walkErr != nil {
			return walkErr
		}
		if info.IsDir() || filepath.Ext(filePath) != ".txt" {
			return nil
		}

		file, openErr := os.Open(filePath)
		if openErr != nil {
			return openErr
		}
		// Runs when this callback returns, i.e. once per file.
		defer file.Close()

		scanner := bufio.NewScanner(file)
		scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)
		for scanner.Scan() {
			// Strip first, then lower-case: the order decides which
			// non-ASCII characters survive, so it is kept as is.
			// strings.Fields already ignores leading/trailing whitespace
			// and stray '\r', and never yields empty tokens.
			cleaned := strings.ToLower(punctuation.ReplaceAllString(scanner.Text(), ""))
			tokens := strings.Fields(cleaned)
			if len(tokens) == 0 {
				continue
			}

			w2v.M.Lock()
			for _, token := range tokens {
				if _, known := vocabSet[token]; !known {
					continue
				}
				w2v.WordFrequency[token]++
				allWords = append(allWords, token)
			}
			w2v.M.Unlock()
		}
		return scanner.Err()
	})
	if err != nil {
		return nil, err
	}
	return allWords, nil
}