-
Notifications
You must be signed in to change notification settings - Fork 77
Expand file tree
/
Copy pathtokenizer-muse.py
More file actions
executable file
·249 lines (195 loc) · 9.08 KB
/
tokenizer-muse.py
File metadata and controls
executable file
·249 lines (195 loc) · 9.08 KB
1
import logging
import multiprocessing as mp
from multiprocessing import Process
import re
import os
import collections
from lockfile import LockFile
import tarfile
import mimetypes

# Provided by the user
PATH_proj_paths = 'c_and_c++_projs.txt'
N_PROCESSES = 30

# C/C++ token separators; each one is replaced by a space before splitting.
# NOTE: the list contains duplicates ('++', '&', '*') — harmless, because the
# second replacement pass is a no-op once the separator is already gone.
separators = ['::','.','->','[',']','(',')','++','--','~','!','-','+','&','*','.*','->*','*','/','%','<<','>>','<','>','<=','>=','++','!=','&','^','|','&&','||','?','==',';','{','}','=','#',',','"','\\',':','$']
ALWAYS = ['@','@#@','@@::@@','#']  # These should always be part of the separators
separators.extend(ALWAYS)

file_extensions = ['.cpp','.hpp','.c','.h','.C','.cc','.CPP','.c++','.cp']
comment_end_of_line = '//'
comment_open_tag = re.escape('/*')
comment_close_tag = re.escape('*/')

# folders
PATH_tokens_folder = 'tokens'
PATH_bookkeeping_file_folder = 'bookkeeping_files'
PATH_bookkeeping_proj_folder = 'bookkeeping_projs'
PATH_projects_success = 'projects_success.txt'
PATH_project_starting_index = 'project_starting_index.txt'
PATH_projects_fail = 'projects_fail.txt'

# Logging code: everything goes both to the console and to results.log
FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
logging.basicConfig(level=logging.DEBUG, format=FORMAT)
file_handler = logging.FileHandler('results.log')
file_handler.setFormatter(logging.Formatter(FORMAT))
logging.getLogger().addHandler(file_handler)

# Some of the files we found happen to be binary, even if their extension is
# something like *.cpp.
# Therefore we explore a behavior of file(1) to find if these files are binary
# http://stackoverflow.com/questions/32184809/python-file1-why-are-the-numbers-7-8-9-10-12-13-27-and-range0x20-0x100
textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})
# True when the byte string contains bytes outside the "texty" set above
is_binary_string = lambda data: bool(data.translate(None, textchars))


def _log_failed_project(proj_path):
    # Append proj_path to the shared failure list.
    # A global lock is important because the file is shared across processes.
    lock = LockFile(PATH_projects_fail)
    with lock:
        with open(PATH_projects_fail, 'a+') as project_failure:
            project_failure.write(proj_path + '\n')


def _extract_tokens(file_string):
    # Strip C/C++ comments, replace separators by spaces, count token
    # occurrences, and return the SourcererCC 'tok@@::@@count' CSV string.
    # Remove end-of-line comments
    file_string = re.sub(comment_end_of_line + '.*?\n', '', file_string, flags=re.DOTALL)
    # Remove /* ... */ tagged comments (non-greedy, may span lines)
    file_string = re.sub(comment_open_tag + '.*?' + comment_close_tag, '', file_string, flags=re.DOTALL)
    # Transform separators into spaces (remove them)
    for sep in separators:
        file_string = file_string.replace(sep, ' ')
    # Count occurrences; plain dict iteration is enough for formatting
    counts = dict(collections.Counter(file_string.split()))
    return ','.join(k + '@@::@@' + str(v) for k, v in counts.items())


def tokenizer(proj_id, proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
    """Tokenize every C/C++ source file inside the single '*_code' tar found
    in proj_path.

    Results are appended to this process's token file (FILE_tokens_name) and
    per-file bookkeeping file; the project itself is recorded either in the
    shared PATH_projects_success or PATH_projects_fail list (lock-protected,
    since those two files are shared by all worker processes).
    """
    logging.info('Starting project <' + proj_id + ',' + proj_path + '>')
    if not os.path.isdir(proj_path):
        logging.error('Unable to open project <' + proj_id + ',' + proj_path + '>')
        _log_failed_project(proj_path)
        return

    # Search for the single tar file whose name contains '_code'
    tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path)
                 if os.path.isfile(os.path.join(proj_path, f))]
    tar_files = [f for f in tar_files if '_code' in f]
    if len(tar_files) != 1:
        logging.error('Tar not found on <' + proj_id + ',' + proj_path + '>')
        _log_failed_project(proj_path)
        return
    tar_file = tar_files[0]

    try:
        with tarfile.open(tar_file, 'r') as my_tar_file:
            # Get all members on the tar file
            all_files = [member.name for member in my_tar_file.getmembers()]
            # Filter them by the correct extension
            aux = []
            for extension in file_extensions:
                aux.extend([x for x in all_files if x.endswith(extension)])
            all_files = aux
            # This is very strange, but I did find some paths with newlines,
            # so I am simply eliminating these
            all_files = [x for x in all_files if '\n' not in x]

            for file_id, file_path in enumerate(all_files):
                logging.info('Starting file <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                try:
                    myfile = my_tar_file.extractfile(file_path)
                except Exception:  # narrowed from a bare except
                    logging.error('Unable to open file (1) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                    # was `break`: one unreadable file should not silently
                    # drop the remaining files of the project
                    continue
                if myfile is None:
                    logging.error('Unable to open file (2) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                    continue
                file_string = myfile.read()
                if is_binary_string(file_string):
                    logging.error('Unable to open file (3) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                    continue

                # SourcererCC formatting
                tokens = _extract_tokens(file_string)
                with open(FILE_tokens_name, 'a+') as FILE_tokens_file:
                    FILE_tokens_file.write(proj_id + ',' + str(file_id) + '@#@' + tokens + '\n')
                with open(FILE_bookkeeping_file_name, 'a+') as FILE_bookkeeping_file:
                    FILE_bookkeeping_file.write(proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '\n')
    except Exception:
        logging.error('Unable to open tar on <' + proj_id + ',' + proj_path + '>')
        _log_failed_project(proj_path)
        return

    with open(FILE_bookkeeping_proj_name, 'a+') as FILE_bookkeeping_proj:
        FILE_bookkeeping_proj.write(proj_id + ',' + proj_path + '\n')

    # Important to have a global lock on this file because it is shared
    lock = LockFile(PATH_projects_success)
    with lock:
        with open(PATH_projects_success, 'a+') as project_success:
            project_success.write(proj_path + '\n')
    logging.info('Project finished <' + proj_id + ',' + proj_path + '>')
def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
    """Worker entry point: each tokenize call represents one process.

    Sequentially tokenizes every (proj_id, proj_path) pair assigned to this
    process, all sharing one set of per-process output files.
    """
    for proj_id, proj_path in list_projects:
        tokenizer(str(proj_id), proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name)


if __name__ == '__main__':
    # In the main file we:
    #   create directories if they do not exist
    #   read list of PATH_projects_success, if exists, and do not process these again
    #   each process needs a unique file with tokens and file and project
    #     bookkeeping in the proper folders
    #   start N_PROCESSES, and give them [(unique_id, proj_path)]
    if not os.path.exists(PATH_tokens_folder):
        os.makedirs(PATH_tokens_folder)
    if not os.path.exists(PATH_bookkeeping_file_folder):
        os.makedirs(PATH_bookkeeping_file_folder)
    if not os.path.exists(PATH_bookkeeping_proj_folder):
        os.makedirs(PATH_bookkeeping_proj_folder)

    proj_paths = []
    with open(PATH_proj_paths) as f:
        for line in f:
            proj_paths.append(line.strip('\n'))

    # Previously completed projects are skipped on re-runs
    projects_success = []
    try:
        with open(PATH_projects_success, 'r') as f:
            for line in f:
                projects_success.append(line.strip().strip('\n'))
    except IOError:
        # fixed log message typo ("no found")
        logging.info('File ' + PATH_projects_success + ' not found')

    projects_starting_index = 0
    proj_paths = list(set(proj_paths) - set(projects_success))

    # Initialize projects_starting_index with previously logged number so that
    # project ids stay unique across restarts
    if not os.path.exists(PATH_project_starting_index):
        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
            FILE_project_starting_index.write(str(len(proj_paths)) + '\n')
    else:
        try:
            with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
                projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
        except ValueError:
            projects_starting_index = 0
        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
            FILE_project_starting_index.write(str(projects_starting_index + len(proj_paths)) + '\n')

    proj_paths = zip(range(projects_starting_index, len(proj_paths) + projects_starting_index), proj_paths)

    # Split list of projects into N_PROCESSES round-robin sublists
    proj_paths_list = [proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES)]

    # Multiprocessing with N_PROCESSES
    processes = []
    process_num = 0
    n = 0
    for input_process in proj_paths_list:
        # Skip empty sublists
        if len(input_process) == 0:
            continue
        process_num += 1
        # Find the next unused index n so output files never get clobbered
        FILE_tokens_name = PATH_tokens_folder + '/' + 'tokens_' + str(n) + '.txt'
        FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder + '/' + 'bookkeeping_file_' + str(n) + '.txt'
        FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder + '/' + 'bookkeeping_proj_' + str(n) + '.txt'
        while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
            n += 1
            FILE_tokens_name = PATH_tokens_folder + '/' + 'tokens_' + str(n) + '.txt'
            FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder + '/' + 'bookkeeping_file_' + str(n) + '.txt'
            FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder + '/' + 'bookkeeping_proj_' + str(n) + '.txt'
        n += 1
        processes.append(Process(name='Process ' + str(process_num), target=tokenize,
                                 args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name,)))

    for proc in processes:
        proc.start()
        logging.info(proc.name)
    for proc in processes:
        proc.join()