-
Notifications
You must be signed in to change notification settings - Fork 77
Expand file tree
/
Copy pathtokenizer-muse.py
More file actions
executable file
·249 lines (195 loc) · 9.08 KB
/
tokenizer-muse.py
File metadata and controls
executable file
·249 lines (195 loc) · 9.08 KB
1
import logging
import multiprocessing as mp
from multiprocessing import Process
import re
import os
import collections
from lockfile import LockFile
import tarfile
import mimetypes

# Provided by the user
PATH_proj_paths = 'c_and_c++_projs.txt'
N_PROCESSES = 30

# C/C++ token separators; each one is replaced by a space before splitting.
# NOTE: the list contains duplicates ('++', '&', '*') — harmless, because the
# second replacement pass is a no-op once the separator is already gone.
separators = ['::','.','->','[',']','(',')','++','--','~','!','-','+','&','*','.*','->*','*','/','%','<<','>>','<','>','<=','>=','++','!=','&','^','|','&&','||','?','==',';','{','}','=','#',',','"','\\',':','$']
ALWAYS = ['@','@#@','@@::@@','#']  # These should always be part of the separators
separators.extend(ALWAYS)

file_extensions = ['.cpp','.hpp','.c','.h','.C','.cc','.CPP','.c++','.cp']
comment_end_of_line = '//'
comment_open_tag = re.escape('/*')
comment_close_tag = re.escape('*/')

# folders
PATH_tokens_folder = 'tokens'
PATH_bookkeeping_file_folder = 'bookkeeping_files'
PATH_bookkeeping_proj_folder = 'bookkeeping_projs'
PATH_projects_success = 'projects_success.txt'
PATH_project_starting_index = 'project_starting_index.txt'
PATH_projects_fail = 'projects_fail.txt'

# Logging code: everything goes both to the console and to results.log
FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
logging.basicConfig(level=logging.DEBUG, format=FORMAT)
file_handler = logging.FileHandler('results.log')
file_handler.setFormatter(logging.Formatter(FORMAT))
logging.getLogger().addHandler(file_handler)

# Some of the files we found happen to be binary, even if their extension is
# something like *.cpp.
# Therefore we explore a behavior of file(1) to find if these files are binary
# http://stackoverflow.com/questions/32184809/python-file1-why-are-the-numbers-7-8-9-10-12-13-27-and-range0x20-0x100
textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})
# True when the byte string contains bytes outside the "texty" set above
is_binary_string = lambda data: bool(data.translate(None, textchars))


def _log_failed_project(proj_path):
    # Append proj_path to the shared failure list.
    # A global lock is important because the file is shared across processes.
    lock = LockFile(PATH_projects_fail)
    with lock:
        with open(PATH_projects_fail, 'a+') as project_failure:
            project_failure.write(proj_path + '\n')


def _extract_tokens(file_string):
    # Strip C/C++ comments, replace separators by spaces, count token
    # occurrences, and return the SourcererCC 'tok@@::@@count' CSV string.
    # Remove end-of-line comments
    file_string = re.sub(comment_end_of_line + '.*?\n', '', file_string, flags=re.DOTALL)
    # Remove /* ... */ tagged comments (non-greedy, may span lines)
    file_string = re.sub(comment_open_tag + '.*?' + comment_close_tag, '', file_string, flags=re.DOTALL)
    # Transform separators into spaces (remove them)
    for sep in separators:
        file_string = file_string.replace(sep, ' ')
    # Count occurrences; plain dict iteration is enough for formatting
    counts = dict(collections.Counter(file_string.split()))
    return ','.join(k + '@@::@@' + str(v) for k, v in counts.items())


def tokenizer(proj_id, proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
    """Tokenize every C/C++ source file inside the single '*_code' tar found
    in proj_path.

    Results are appended to this process's token file (FILE_tokens_name) and
    per-file bookkeeping file; the project itself is recorded either in the
    shared PATH_projects_success or PATH_projects_fail list (lock-protected,
    since those two files are shared by all worker processes).
    """
    logging.info('Starting project <' + proj_id + ',' + proj_path + '>')
    if not os.path.isdir(proj_path):
        logging.error('Unable to open project <' + proj_id + ',' + proj_path + '>')
        _log_failed_project(proj_path)
        return

    # Search for the single tar file whose name contains '_code'
    tar_files = [os.path.join(proj_path, f) for f in os.listdir(proj_path)
                 if os.path.isfile(os.path.join(proj_path, f))]
    tar_files = [f for f in tar_files if '_code' in f]
    if len(tar_files) != 1:
        logging.error('Tar not found on <' + proj_id + ',' + proj_path + '>')
        _log_failed_project(proj_path)
        return
    tar_file = tar_files[0]

    try:
        with tarfile.open(tar_file, 'r') as my_tar_file:
            # Get all members on the tar file
            all_files = [member.name for member in my_tar_file.getmembers()]
            # Filter them by the correct extension
            aux = []
            for extension in file_extensions:
                aux.extend([x for x in all_files if x.endswith(extension)])
            all_files = aux
            # This is very strange, but I did find some paths with newlines,
            # so I am simply eliminating these
            all_files = [x for x in all_files if '\n' not in x]

            for file_id, file_path in enumerate(all_files):
                logging.info('Starting file <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                try:
                    myfile = my_tar_file.extractfile(file_path)
                except Exception:  # narrowed from a bare except
                    logging.error('Unable to open file (1) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                    # was `break`: one unreadable file should not silently
                    # drop the remaining files of the project
                    continue
                if myfile is None:
                    logging.error('Unable to open file (2) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                    continue
                file_string = myfile.read()
                if is_binary_string(file_string):
                    logging.error('Unable to open file (3) <' + proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '>')
                    continue

                # SourcererCC formatting
                tokens = _extract_tokens(file_string)
                with open(FILE_tokens_name, 'a+') as FILE_tokens_file:
                    FILE_tokens_file.write(proj_id + ',' + str(file_id) + '@#@' + tokens + '\n')
                with open(FILE_bookkeeping_file_name, 'a+') as FILE_bookkeeping_file:
                    FILE_bookkeeping_file.write(proj_id + ',' + str(file_id) + ',' + os.path.join(tar_file, file_path) + '\n')
    except Exception:
        logging.error('Unable to open tar on <' + proj_id + ',' + proj_path + '>')
        _log_failed_project(proj_path)
        return

    with open(FILE_bookkeeping_proj_name, 'a+') as FILE_bookkeeping_proj:
        FILE_bookkeeping_proj.write(proj_id + ',' + proj_path + '\n')

    # Important to have a global lock on this file because it is shared
    lock = LockFile(PATH_projects_success)
    with lock:
        with open(PATH_projects_success, 'a+') as project_success:
            project_success.write(proj_path + '\n')
    logging.info('Project finished <' + proj_id + ',' + proj_path + '>')
def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name):
    """Worker entry point: each tokenize call represents one process.

    Sequentially tokenizes every (proj_id, proj_path) pair assigned to this
    process, all sharing one set of per-process output files.
    """
    for proj_id, proj_path in list_projects:
        tokenizer(str(proj_id), proj_path, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name)


if __name__ == '__main__':
    # In the main file we:
    #   create directories if they do not exist
    #   read list of PATH_projects_success, if exists, and do not process these again
    #   each process needs a unique file with tokens and file and project
    #     bookkeeping in the proper folders
    #   start N_PROCESSES, and give them [(unique_id, proj_path)]
    if not os.path.exists(PATH_tokens_folder):
        os.makedirs(PATH_tokens_folder)
    if not os.path.exists(PATH_bookkeeping_file_folder):
        os.makedirs(PATH_bookkeeping_file_folder)
    if not os.path.exists(PATH_bookkeeping_proj_folder):
        os.makedirs(PATH_bookkeeping_proj_folder)

    proj_paths = []
    with open(PATH_proj_paths) as f:
        for line in f:
            proj_paths.append(line.strip('\n'))

    # Previously completed projects are skipped on re-runs
    projects_success = []
    try:
        with open(PATH_projects_success, 'r') as f:
            for line in f:
                projects_success.append(line.strip().strip('\n'))
    except IOError:
        # fixed log message typo ("no found")
        logging.info('File ' + PATH_projects_success + ' not found')

    projects_starting_index = 0
    proj_paths = list(set(proj_paths) - set(projects_success))

    # Initialize projects_starting_index with previously logged number so that
    # project ids stay unique across restarts
    if not os.path.exists(PATH_project_starting_index):
        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
            FILE_project_starting_index.write(str(len(proj_paths)) + '\n')
    else:
        try:
            with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
                projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
        except ValueError:
            projects_starting_index = 0
        with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
            FILE_project_starting_index.write(str(projects_starting_index + len(proj_paths)) + '\n')

    proj_paths = zip(range(projects_starting_index, len(proj_paths) + projects_starting_index), proj_paths)

    # Split list of projects into N_PROCESSES round-robin sublists
    proj_paths_list = [proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES)]

    # Multiprocessing with N_PROCESSES
    processes = []
    process_num = 0
    n = 0
    for input_process in proj_paths_list:
        # Skip empty sublists
        if len(input_process) == 0:
            continue
        process_num += 1
        # Find the next unused index n so output files never get clobbered
        FILE_tokens_name = PATH_tokens_folder + '/' + 'tokens_' + str(n) + '.txt'
        FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder + '/' + 'bookkeeping_file_' + str(n) + '.txt'
        FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder + '/' + 'bookkeeping_proj_' + str(n) + '.txt'
        while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
            n += 1
            FILE_tokens_name = PATH_tokens_folder + '/' + 'tokens_' + str(n) + '.txt'
            FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder + '/' + 'bookkeeping_file_' + str(n) + '.txt'
            FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder + '/' + 'bookkeeping_proj_' + str(n) + '.txt'
        n += 1
        processes.append(Process(name='Process ' + str(process_num), target=tokenize,
                                 args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name,)))

    for proc in processes:
        proc.start()
        logging.info(proc.name)
    for proc in processes:
        proc.join()