diff --git a/tokenizers/all-file-level/c.ini b/tokenizers/all-file-level/c.ini
new file mode 100644
index 000000000..25337485f
--- /dev/null
+++ b/tokenizers/all-file-level/c.ini
@@ -0,0 +1,6 @@
+[Language]
+separators = ; :: . -> [ ] ( ) ++ -- ~ ! - + & * .* ->* * / % << >> < > <= >= ++ != & ^ | && || ? == { } = # , " \\ : $
+file_extensions = .cpp .hpp .c .h .C .cc .CPP .c++ .cp
+comment_inline = //
+comment_open_tag = /*
+comment_close_tag = */
diff --git a/tokenizers/all-file-level/config.ini b/tokenizers/all-file-level/config.ini
new file mode 100644
index 000000000..be8e9b351
--- /dev/null
+++ b/tokenizers/all-file-level/config.ini
@@ -0,0 +1,13 @@
+[Main]
+N_PROCESSES = 4
+language_file = c.ini
+
+[Folders/Files]
+PATH_proj_paths = project-list.txt
+PATH_tokens_folder = tokens
+PATH_bookkeeping_file_folder = bookkeeping_files
+PATH_bookkeeping_proj_folder = bookkeeping_projs
+PATH_projects_success = projects_success.txt
+PATH_project_starting_index = project_starting_index.txt
+PATH_projects_fail = projects_fail.txt
+PATH_mirror_repo = mirror_repo
diff --git a/tokenizers/all-file-level/create_repo_tree.py b/tokenizers/all-file-level/src/create_repo_tree.py
similarity index 100%
rename from tokenizers/all-file-level/create_repo_tree.py
rename to tokenizers/all-file-level/src/create_repo_tree.py
diff --git a/tokenizers/all-file-level/src/tokenizer-directory.py b/tokenizers/all-file-level/src/tokenizer-directory.py
new file mode 100644
index 000000000..74ec7096e
--- /dev/null
+++ b/tokenizers/all-file-level/src/tokenizer-directory.py
@@ -0,0 +1,232 @@
+import logging
+import sys
+from multiprocessing import Process, Value
+import re
+import os
+import collections
+from lockfile import LockFile
+
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser  # ver. < 3.0
+
+# Logging code
+FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
+logging.basicConfig(level=logging.DEBUG,format=FORMAT)
+file_handler = logging.FileHandler('results.log')
+file_handler.setFormatter(logging.Formatter(FORMAT))
+logging.getLogger().addHandler(file_handler)
+
+config_file = 'config.ini'
+
+# instantiate
+config = ConfigParser()
+
+# parse existing file
+try:
+    config.read(config_file)
+except IOError:
+    print 'Config settings not found'
+    logging.error('Config file ['+config_file+'] not found')
+    sys.exit()
+
+# Provided by the user
+N_PROCESSES = config.getint('Main', 'N_PROCESSES')
+language_file = config.get('Main', 'language_file')
+
+# Folders
+PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
+PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
+PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
+PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
+PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
+PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
+PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')
+
+try:
+    config.read(language_file)
+except IOError:
+    print 'Language settings not found'
+    logging.error('Language settings ['+language_file+'] not found')
+    sys.exit()
+
+# Read language settings
+separators = config.get('Language', 'separators').split(' ')
+file_extensions = config.get('Language', 'file_extensions').split(' ')
+comment_end_of_line = config.get('Language', 'comment_inline')
+comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
+comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))
+
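+# Each worker tokenizes whole files: strip comments, replace the language's
+# separators with spaces, split on whitespace, and count token occurrences.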
+def tokenizer(proj_id, proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj):
+    if not os.path.exists(proj_path):
+        logging.error('Project not found <'+proj_id+','+proj_path+'>')
+        # Important to have a global lock on this file because it is shared
+        lock = LockFile(PATH_projects_fail)
+        with lock:
+            with open(PATH_projects_fail,'a+') as project_fail:
+                project_fail.write(proj_path+'\n')
+        return
+
+    # In case process names need to be logged
+    # process_name = '['+mp.current_process().name+'] '
+    # logging.info(process_name+'Starting proj <'+proj_id+','+proj_path+'>')
+
+    all_files = []
+    for (dirpath, dirnames, filenames) in os.walk(proj_path):
+        aux_list = []
+        for extension in file_extensions:
+            aux = [x for x in filenames if x.endswith(extension)]
+            # Keep the full path so files in nested directories resolve
+            aux = [os.path.join(dirpath, x) for x in aux]
+            aux_list.extend(aux)
+        all_files.extend(aux_list)
+
+    # Increment the shared file_starting_id by the number of files in the
+    # current project; the Value's built-in lock synchronizes the counter
+    with file_starting_id.get_lock():
+        all_files = zip(range(file_starting_id.value, file_starting_id.value+len(all_files)), all_files)
+        file_starting_id.value += len(all_files)
+
+    for file_id, file_path in all_files:
+        logging.info('Starting file <'+proj_id+','+str(file_id)+','+file_path+'>')
+        try:
+            with open(file_path,'r') as myfile:
+                file_string = myfile.read()
+        except IOError:
+            logging.error('File not found <'+proj_id+','+str(file_id)+','+file_path+'>')
+            continue
+
+        # Remove end-of-line comments
+        file_string = re.sub(comment_end_of_line+'.*?\n','',file_string,flags=re.DOTALL)
+        # Remove tagged comments
+        file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag,'',file_string,flags=re.DOTALL)
+        # Transform separators into spaces (remove them)
+        for x in separators:
+            file_string = file_string.replace(x,' ')
+        # Create a list of tokens
+        file_string = file_string.split()
+        # Total number of tokens
+        tokens_count_total = len(file_string)
+        # Count occurrences
+        file_string = collections.Counter(file_string)
+        # Convert the Counter to a plain dict (cheaper to iterate)
+        file_string = dict(file_string)
+        # Unique number of tokens
+        tokens_count_unique = len(file_string)
+
+        tokens = []
+        # SourcererCC formatting
+        for k, v in file_string.items():
+            tokens.append(k+'@@::@@'+str(v))
+        tokens = ','.join(tokens)
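+        # One bookkeeping line and one token line are written per file.
+        # Token line layout (values hypothetical):
+        #   proj_id,file_id,total_tokens,unique_tokens@#@token1@@::@@count1,token2@@::@@count2,...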
+        FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+file_path+'\n')
+        FILE_tokens.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)+'@#@'+tokens+'\n')
+
+    FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')
+
+    # Important to have a global lock on this file because it is shared
+    lock = LockFile(PATH_projects_success)
+    with lock:
+        with open(PATH_projects_success,'a+') as project_success:
+            project_success.write(proj_path+'\n')
+
+
+def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id):
+    # Each tokenize call runs in its own process
+    with open(FILE_tokens_name, 'w') as FILE_tokens, \
+         open(FILE_bookkeeping_file_name, 'w') as FILE_bookkeeping_file, \
+         open(FILE_bookkeeping_proj_name, 'w') as FILE_bookkeeping_proj:
+        for proj_id, proj_path in list_projects:
+            tokenizer(str(proj_id), proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj)
+
+
+if __name__ == '__main__':
+    # In the main block we:
+    #   create directories if they do not exist
+    #   read the list in PATH_projects_success, if it exists, and skip those projects
+    #   give each process a unique tokens file plus per-file and per-project
+    #     bookkeeping files in the proper folders
+    #   start N_PROCESSES, and give them [(unique_id, proj_path)]
+
+    if not os.path.exists(PATH_tokens_folder):
+        os.makedirs(PATH_tokens_folder)
+    if not os.path.exists(PATH_bookkeeping_file_folder):
+        os.makedirs(PATH_bookkeeping_file_folder)
+    if not os.path.exists(PATH_bookkeeping_proj_folder):
+        os.makedirs(PATH_bookkeeping_proj_folder)
+
+    proj_paths = []
+    with open(PATH_proj_paths) as f:
+        for line in f:
+            proj_paths.append(line.strip('\n'))
+
+    projects_success = []
+    try:
+        with open(PATH_projects_success,'r') as f:
+            for line in f:
+                projects_success.append(line.strip().strip('\n'))
+    except IOError:
+        logging.info('File '+PATH_projects_success+' not found')
+
+    projects_starting_index = 0
+    proj_paths = list(set(proj_paths) - set(projects_success))
+
+    # Initialize projects_starting_index with the previously logged number
+    if not os.path.exists(PATH_project_starting_index):
+        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
+            FILE_project_starting_index.write(str(len(proj_paths))+'\n')
+    else:
+        try:
+            with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
+                projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
+        except ValueError:
+            projects_starting_index = 0
+
+        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
+            FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')
+
+    proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index), proj_paths)
+
+    # Split the list of projects into N_PROCESSES lists
+    proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]
+
+    # Multiprocessing with N_PROCESSES
+    processes = []
+    # Shared counter for handing out globally unique file ids
+    file_starting_id = Value('i', 0)
+    process_num = 0
+    n = 0
+    for input_process in proj_paths_list:
+
+        # Skip empty sublists
+        if len(input_process) == 0:
+            continue
+
+        process_num += 1
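+        # Find the next unused index n so that reruns do not overwrite output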
+        FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
+        FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
+        FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
+
+        while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
+            n += 1
+            FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
+            FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
+            FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
+
+        n += 1
+        processes.append(Process(name='Process '+str(process_num), target=tokenize,
+                                 args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id, )))
+
+    for proc in processes:
+        proc.start()
+        logging.info(proc.name)
+
+    for proc in processes:
+        proc.join()
diff --git a/tokenizers/all-file-level/tokenizer-muse.py b/tokenizers/all-file-level/src/tokenizer-muse.py
similarity index 100%
rename from tokenizers/all-file-level/tokenizer-muse.py
rename to tokenizers/all-file-level/src/tokenizer-muse.py
diff --git a/tokenizers/all-file-level/src/tokenizer-tar.py b/tokenizers/all-file-level/src/tokenizer-tar.py
new file mode 100644
index 000000000..6869f086d
--- /dev/null
+++ b/tokenizers/all-file-level/src/tokenizer-tar.py
@@ -0,0 +1,283 @@
+import logging
+import multiprocessing as mp
+from multiprocessing import Process
+import re
+import os
+import collections
+from lockfile import LockFile
+import tarfile
+import mimetypes
+import sys
+import hashlib
+
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser  # ver. < 3.0
+
+# Logging code
+FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
+logging.basicConfig(level=logging.DEBUG,format=FORMAT)
+file_handler = logging.FileHandler('results.log')
+file_handler.setFormatter(logging.Formatter(FORMAT))
+logging.getLogger().addHandler(file_handler)
+
+config_file = 'config.ini'
+
+# instantiate
+config = ConfigParser()
+
+# parse existing file
+try:
+    config.read(config_file)
+except IOError:
+    print 'Config settings not found'
+    logging.error('Config file ['+config_file+'] not found')
+    sys.exit()
+
+N_PROCESSES = config.getint('Main', 'N_PROCESSES')
+language_file = config.get('Main', 'language_file')
+
+# Folders
+PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
+PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
+PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
+PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
+PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
+PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
+PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')
+
+try:
+    config.read(language_file)
+except IOError:
+    print 'Language settings not found'
+    logging.error('Language settings ['+language_file+'] not found')
+    sys.exit()
+
+# Read language settings
+separators = config.get('Language', 'separators').split(' ')
+file_extensions = config.get('Language', 'file_extensions').split(' ')
+comment_end_of_line = config.get('Language', 'comment_inline')
+comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
+comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))
+
+
+ALWAYS = ['@','@#@','@@::@@','#']  # These should always be part of the separators
+separators.extend(ALWAYS)
+
+# Some of the files we found happen to be binary, even if their extension is
+# something like *.cpp. Therefore we borrow a heuristic from file(1) to detect them:
+# http://stackoverflow.com/questions/32184809/python-file1-why-are-the-numbers-7-8-9-10-12-13-27-and-range0x20-0x100
+textchars = bytearray({7,8,9,10,12,13,27} | set(range(0x20, 0x100)) - {0x7f})
+is_binary_string = lambda bytes: bool(bytes.translate(None, textchars))
+
+def tokenizer(proj_id, proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
+    logging.info('Starting project <'+proj_id+','+proj_path+'>')
+
+    if not os.path.isdir(proj_path):
+        logging.error('Unable to open project <'+proj_id+','+proj_path+'>')
+        lock = LockFile(PATH_projects_fail)
+        with lock:
+            with open(PATH_projects_fail,'a+') as project_failure:
+                project_failure.write(proj_path+'\n')
+        return
+
+    # Search for all tar files
+    tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path) if os.path.isfile(os.path.join(proj_path, f))]
+    tar_files = [f for f in tar_files if '_code' in f]
+    if len(tar_files) != 1:
+        logging.error('Tar not found on <'+proj_id+','+proj_path+'>')
+        # Important to have a global lock on this file because it is shared
+        lock = LockFile(PATH_projects_fail)
+        with lock:
+            with open(PATH_projects_fail,'a+') as project_fail:
+                project_fail.write(proj_path+'\n')
+        return
+
+    tar_file = tar_files[0]
+
+    try:
+        with tarfile.open(tar_file,'r') as my_tar_file:
+            # Get all members of the tar file
+            all_files = []
+            for member in my_tar_file.getmembers():
+                all_files.append(member.name)
+
+            # Filter them by the correct extension
+            aux = []
+            for extension in file_extensions:
+                aux.extend([x for x in all_files if x.endswith(extension)])
+            all_files = aux
+
+            # This is very strange, but I did find some paths with newlines,
+            # so I am simply eliminating these
+            all_files = [x for x in all_files if '\n' not in x]
+
+            # In case process names need to be logged
+            # process_name = '['+mp.current_process().name+'] '
+
+            all_files = zip(range(0,len(all_files)),all_files)
+
+            for file_id, file_path in all_files:
+
+                logging.info('Starting file <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
+
+                try:
+                    myfile = my_tar_file.extractfile(file_path)
+                except:
+                    logging.error('Unable to open file (1) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
+                    break
+
+                if myfile is None:
+                    logging.error('Unable to open file (2) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
+                    break
+
+                file_string = myfile.read()
+
+                if is_binary_string(file_string):
+                    logging.error('Unable to open file (3) <'+proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'>')
+                    break
+
+                # Remove end-of-line comments
+                file_string = re.sub(comment_end_of_line+'.*?\n','',file_string,flags=re.DOTALL)
+                # Remove tagged comments
+                file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag,'',file_string,flags=re.DOTALL)
+                # Transform separators into spaces (remove them)
+                for x in separators:
+                    file_string = file_string.replace(x,' ')
+                # Create a list of tokens
+                file_string = file_string.split()
+                # Total number of tokens
+                tokens_count_total = len(file_string)
+                # Count occurrences
+                file_string = collections.Counter(file_string)
+                # Convert the Counter to a plain dict (cheaper to iterate)
+                file_string = dict(file_string)
+                # Unique number of tokens
+                tokens_count_unique = len(file_string)
+
+                tokens = []
+                # SourcererCC formatting
+                for k, v in file_string.items():
+                    tokens.append(k+'@@::@@'+str(v))
+                tokens = ','.join(tokens)
+
+                # MD5 of the token string
+                m = hashlib.md5()
+                m.update(tokens)
+
+                with open(FILE_tokens_name, 'a+') as FILE_tokens_file:
+                    FILE_tokens_file.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)\
+                    +','+m.hexdigest()\
+                    +'@#@'+tokens+'\n')
+
+                with open(FILE_bookkeeping_file_name, 'a+') as FILE_bookkeeping_file:
+                    FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+os.path.join(tar_file,file_path)+'\n')
+
+    except Exception:
+        logging.error('Unable to open tar on <'+proj_id+','+proj_path+'>')
+        lock = LockFile(PATH_projects_fail)
+        with lock:
+            with open(PATH_projects_fail,'a+') as project_failure:
+                project_failure.write(proj_path+'\n')
+        return
+
+    with open(FILE_bookkeeping_proj_name, 'a+') as FILE_bookkeeping_proj:
+        FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')
+
+    # Important to have a global lock on this file because it is shared
+    lock = LockFile(PATH_projects_success)
+    with lock:
+        with open(PATH_projects_success,'a+') as project_success:
+            project_success.write(proj_path+'\n')
+
+    logging.info('Project finished <'+proj_id+','+proj_path+'>')
+
+
+def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
+
+    # Each tokenize call runs in its own process
+    for proj_id, proj_path in list_projects:
+        tokenizer(str(proj_id), proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name)
+
+
+if __name__ == '__main__':
+    # In the main block we:
+    #   create directories if they do not exist
+    #   read the list in PATH_projects_success, if it exists, and skip those projects
+    #   give each process a unique tokens file plus per-file and per-project
+    #     bookkeeping files in the proper folders
+    #   start N_PROCESSES, and give them [(unique_id, proj_path)]
+
+    if not os.path.exists(PATH_tokens_folder):
+        os.makedirs(PATH_tokens_folder)
+    if not os.path.exists(PATH_bookkeeping_file_folder):
+        os.makedirs(PATH_bookkeeping_file_folder)
+    if not os.path.exists(PATH_bookkeeping_proj_folder):
+        os.makedirs(PATH_bookkeeping_proj_folder)
+
+    proj_paths = []
+    with open(PATH_proj_paths) as f:
+        for line in f:
+            proj_paths.append(line.strip('\n'))
+
+    projects_success = []
+    try:
+        with open(PATH_projects_success,'r') as f:
+            for line in f:
+                projects_success.append(line.strip().strip('\n'))
+    except IOError:
+        logging.info('File '+PATH_projects_success+' not found')
+
+    projects_starting_index = 0
+    proj_paths = list(set(proj_paths) - set(projects_success))
+
+    # Initialize projects_starting_index with the previously logged number
+    if not os.path.exists(PATH_project_starting_index):
+        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
+            FILE_project_starting_index.write(str(len(proj_paths))+'\n')
+    else:
+        try:
+            with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
+                projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
+        except ValueError:
+            projects_starting_index = 0
+
+        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
+            FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')
+
+    proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index),proj_paths)
+
+    # Split the list of projects into N_PROCESSES lists
+    proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]
+
+    # Multiprocessing with N_PROCESSES
+    processes = []
+    process_num = 0
+    n = 0
+    for input_process in proj_paths_list:
+
+        # Skip empty sublists
+        if len(input_process) == 0:
+            continue
+
+        process_num += 1
+        FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
+        FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
+        FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
+
+        while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
+            n += 1
+            FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
+            FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
+            FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
+
+        n += 1
+        processes.append(Process(name='Process '+str(process_num), target=tokenize, args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name,)))
+
+    for proc in processes:
+        proc.start()
+        logging.info(proc.name)
+
+    for proc in processes:
+        proc.join()
diff --git a/tokenizers/all-file-level/src/tokens-mirror.py b/tokenizers/all-file-level/src/tokens-mirror.py
new file mode 100644
index 000000000..8adc9a6ad
--- /dev/null
+++ b/tokenizers/all-file-level/src/tokens-mirror.py
@@ -0,0 +1,80 @@
+import logging
+import multiprocessing as mp
+from multiprocessing import Process
+import re
+import os
+import sys
+import collections
+from lockfile import LockFile
+
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser  # ver. < 3.0
+
+# Logging code
+FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
+logging.basicConfig(level=logging.DEBUG,format=FORMAT)
+file_handler = logging.FileHandler('results.log')
+file_handler.setFormatter(logging.Formatter(FORMAT))
+logging.getLogger().addHandler(file_handler)
+
+config_file = 'config.ini'
+
+# instantiate
+config = ConfigParser()
+
+# parse existing file
+try:
+    config.read(config_file)
+except IOError:
+    print 'Config settings not found'
+    logging.error('Config file ['+config_file+'] not found')
+    sys.exit()
+
+# folders
+PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
+PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
+PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
+PATH_TARGET = config.get('Folders/Files', 'PATH_mirror_repo')
+
+token_files = [f for f in os.listdir(PATH_tokens_folder) if os.path.isfile(os.path.join(PATH_tokens_folder, f))]
+book_proj_files = [f for f in os.listdir(PATH_bookkeeping_proj_folder) if os.path.isfile(os.path.join(PATH_bookkeeping_proj_folder, f))]
+book_files_files = [f for f in os.listdir(PATH_bookkeeping_file_folder) if os.path.isfile(os.path.join(PATH_bookkeeping_file_folder, f))]
+
+number = 0
+while ('tokens_'+str(number)+'.txt') in token_files and ('bookkeeping_proj_'+str(number)+'.txt') in book_proj_files and ('bookkeeping_file_'+str(number)+'.txt') in book_files_files:
+    with open(os.path.join(PATH_tokens_folder,'tokens_'+str(number)+'.txt'),'r') as tokens, open(os.path.join(PATH_bookkeeping_proj_folder,'bookkeeping_proj_'+str(number)+'.txt'),'r') as projects, open(os.path.join(PATH_bookkeeping_file_folder,'bookkeeping_file_'+str(number)+'.txt'),'r') as files:
+
+        print 'Reading ','bookkeeping_file_'+str(number)+'.txt'
+        files_dict = {}
+        for file in files:
+            if len(file) > 2:
+                file = file.strip('\n').split(',')
+                # Map (proj_id, file_id) to the file path
+                files_dict[(file[0], file[1])] = file[2]
+
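+        # files_dict now maps (proj_id, file_id) -> file path, read from lines
+        # like (hypothetical): 17,3,/projects/foo/src/main.cpp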
+        print 'Reading ','bookkeeping_proj_'+str(number)+'.txt'
+        projs_dict = {}
+        for project in projects:
+            if len(project) > 2:
+                project = project.strip('\n').split(',')
+                # Get the project path
+                projs_dict[project[0]] = project[1]
+                # Create the mirror folder if it does not exist
+                if not os.path.exists(os.path.join(PATH_TARGET,project[1])):
+                    os.makedirs(os.path.join(PATH_TARGET,project[1]))
+
+        print 'Reading ','tokens_'+str(number)+'.txt'
+        for token in tokens:
+            # Proceed if the token line is not empty
+            if len(token) > 2:
+                # Write the mirror to the corresponding file
+                with open(os.path.join(PATH_TARGET,projs_dict[token.split(',')[0]])+'/tokens.txt','a') as tokens_file:
+                    file_path = files_dict[(token.split(',')[0],token.split(',')[1])].split(projs_dict[token.split(',')[0]])[1][1:]
+                    # print file_path
+                    tokens_file.write(file_path+'@#@'+token.split('@#@')[1])
+
+    number += 1
diff --git a/tokenizers/all-file-level/tokenizer.py b/tokenizers/all-file-level/tokenizer.py
old mode 100755
new mode 100644
index 41c94aca6..89538ee1e
--- a/tokenizers/all-file-level/tokenizer.py
+++ b/tokenizers/all-file-level/tokenizer.py
@@ -1 +1,19 @@
-import logging import multiprocessing as mp from multiprocessing import Process import re import os import collections from lockfile import LockFile # Provided by the user PATH_proj_paths = 'projects.txt' N_PROCESSES = 4 separators = ['::','.','->','[',']','(',')','++','--','~','!','-','+','&','*','.*','->*','*','/','%','<<','>>','<','>','<=','>=','++','!=','&','^','|','&&','||','?','==',';','{','}','=','#',','] file_extensions = ['cpp','hpp','c','h'] comment_end_of_line = '//' comment_open_tag = re.escape('/*') comment_close_tag = re.escape('*/') # folders PATH_tokens_folder = 'tokens' PATH_bookkeeping_file_folder = 'bookkeeping_files' PATH_bookkeeping_proj_folder = 'bookkeeping_projs' PATH_projects_success = 'projects_success.txt' PATH_project_starting_index = 'project_starting_index.txt' PATH_projects_fail = 'projects_fail.txt' # Logging code FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s' logging.basicConfig(level=logging.DEBUG,format=FORMAT) file_handler = logging.FileHandler('results.log') file_handler.setFormatter(logging.Formatter(FORMAT)) logging.getLogger().addHandler(file_handler) def tokenizer(proj_id, proj_path, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj): if not os.path.exists(proj_path): logging.error('Project not found <'+proj_id+','+proj_path+'>') # Important to have a global loc on this file because it is shared lock = LockFile(PATH_projects_fail) with lock: with open(PATH_projects_fail,'a+') as project_fail: project_fail.write(proj_path+'\n') return # In case process names need to be logged # process_name = '['+mp.current_process().name+'] ' # logging.info(process_name+'Starting proj <'+proj_id+','+proj_path+'>') all_files = [] for (dirpath, dirnames, filenames) in os.walk(proj_path): aux = [] for extension in file_extensions: aux = [x for x in filenames if x.endswith(extension)] aux = [os.path.join(dirpath,x) for x in aux] all_files.extend(aux) all_files = zip(range(0,len(all_files)),all_files) for file_id, file_path in all_files: logging.info('Starting file <'+proj_id+','+str(file_id)+','+file_path+'>') try: with open(file_path,'r') as myfile: file_string = myfile.read() except IOError: logging.error('File not found <'+proj_id+','+str(file_id)+','+file_path+'>') continue # Remove enf of line comments file_string = re.sub(comment_end_of_line+'.*?\n','',file_string,flags=re.DOTALL) # Remove tagged comments file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag,'',file_string,flags=re.DOTALL) #Transform separators into spaces (remove them) for x in separators: file_string = file_string.replace(x,' ') #Create a list of tokens file_string = file_string.split() #Count occurrences file_string = collections.Counter(file_string) #Converting Counter to dict because according to StackOverflow is better file_string=dict(file_string) tokens = [] #SourcererCC formatting for k, v in file_string.items(): tokens.append(k+'@@::@@'+str(v)) tokens = ','.join(tokens) FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+file_path+'\n') FILE_tokens.write(proj_id+','+str(file_id)+'@#@'+tokens+'\n') FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n') # Important to have a global loc on this file because it is shared lock = LockFile(PATH_projects_success) with lock: with open(PATH_projects_success,'a+') as project_success: project_success.write(proj_path+'\n') def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name): # Each tokenize will represent a new process with open(FILE_tokens_name, 'w') as FILE_tokens, \ open(FILE_bookkeeping_file_name, 'w') as FILE_bookkeeping_file, \ open(FILE_bookkeeping_proj_name, 'w') as FILE_bookkeeping_proj: for proj_id, proj_path in list_projects: tokenizer(str(proj_id), proj_path, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj) if __name__ == '__main__': #In the main file we: # create directories if they do not exist # read list of PATH_projects_success, if exists, and do not process these again # each process needs a unique file with tokens and file and project # bookkeeping in the proper folders # start N_PROCESSES, and give them [(unique_id, proj_path)] if not os.path.exists(PATH_tokens_folder): os.makedirs(PATH_tokens_folder) if not os.path.exists(PATH_bookkeeping_file_folder): os.makedirs(PATH_bookkeeping_file_folder) if not os.path.exists(PATH_bookkeeping_proj_folder): os.makedirs(PATH_bookkeeping_proj_folder) proj_paths = [] with open(PATH_proj_paths) as f: for line in f: proj_paths.append(line.strip('\n')) projects_success = [] try: with open(PATH_projects_success,'r') as f: for line in f: projects_success.append(line.strip().strip('\n')) except IOError as e: logging.info('File '+PATH_projects_success+' no found') projects_starting_index = 0 proj_paths = list(set(proj_paths) - set(projects_success)) # Initialize projects_starting_index with previous logged number if not os.path.exists(PATH_project_starting_index): with open(PATH_project_starting_index, 'w') as FILE_project_starting_index: FILE_project_starting_index.write(str(len(proj_paths))+'\n') else: try: with open(PATH_project_starting_index, 'r') as FILE_project_starting_index: projects_starting_index = int(FILE_project_starting_index.readline().strip('\n')) except ValueError: projects_starting_index = 0 with open(PATH_project_starting_index, 'w') as FILE_project_starting_index: FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n') proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index),proj_paths) #Split list of projects into N_PROCESSES lists proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ] # Multiprocessing with N_PROCESSES processes = [] process_num = 0 n =0 for input_process in proj_paths_list: # Skip empty sublists if len(input_process) == 0: continue process_num += 1 FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt' FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt' FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt' while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)): n += 1 FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt' FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt' FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt' n += 1 processes.append(Process(name='Process '+str(process_num), target=tokenize, args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name,))) for proc in processes: proc.start() logging.info(proc.name) for proc in processes: proc.join()
\ No newline at end of file
+import sys
+import subprocess
+
+arg = ''
+if len(sys.argv) > 1:
+    arg = sys.argv[1]
+
+functions = ['folder', 'tar', 'mirror']
+
+if arg == functions[0]:
+    subprocess.call("python src/tokenizer-directory.py", shell=True)
+elif arg == functions[1]:
+    subprocess.call("python src/tokenizer-tar.py", shell=True)
+elif arg == functions[2]:
+    subprocess.call("python src/tokens-mirror.py", shell=True)
+else:
+    # print 'No argument specified'
+    print '\nPossible arguments are:\t' + ', '.join(functions) + '\n'
+    sys.exit()
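
Usage sketch for the reworked tokenizer.py dispatcher (hypothetical invocation,
assuming config.ini and project-list.txt sit in the working directory):

    python tokenizer.py folder   # tokenize project directories on disk
    python tokenizer.py tar      # tokenize each project's *_code tar archive
    python tokenizer.py mirror   # mirror token files back into a repository tree

Each token line written by the folder and tar tokenizers has the layout

    proj_id,file_id,total_tokens,unique_tokens[,md5]@#@token1@@::@@count1,token2@@::@@count2,...

where the md5 field is only written by the tar variant.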