@@ -91,7 +91,7 @@ def down(fastest_ip, url, final_path):
9191 shutil .move (target_path , final_path )
9292 print (f"{ tm ()} Moved downloading file to zip file." )
9393
94- def parse_one_line (line , fastest_ip , clean_src_file , output_folder , chunk_counter ):
94+ def parse_one_line (line , fastest_ip , clean_src_file , output_folder , chunk_counter , final = False ):
9595 rid , addr = line .strip ().split ("," , 1 )
9696 addr = addr .strip ()
9797 if len (rid ) < 3 : rid = rid .zfill (3 )
@@ -127,7 +127,7 @@ def parse_one_line(line, fastest_ip, clean_src_file, output_folder, chunk_counte
127127 # 提取代码语料到jsonl
128128 print (f"{ tm ()} Generating jsonl files." , end = " " , flush = True )
129129 handler = Zipfile2JsonL (output_folder , target_encoding = "utf-8" , clean_src_file = clean_src_file , plateform = "github" , author = author , chunk_counter = chunk_counter )
130- handler (final_path ) # final_path: str, 最后的zip文件
130+ handler (final_path , final = final ) # final_path: str, 最后的zip文件; final: bool, 是否是repos_list的最后一行(即最后一个仓库)
131131 chunk_counter = handler .return_counter ()
132132 print (f"DONE! { tm ()} " )
133133 return chunk_counter
@@ -167,7 +167,8 @@ def main(file_name, clean_src_file):
167167
168168 done_num = 0
169169 with open (filename , "r" , encoding = "utf-8" )as reader :
170- for line in reader :
170+ file_data = reader .readlines ()
171+ for idx ,line in enumerate (file_data ):
171172 rid , addr = line .strip ().split ("," , 1 )
172173 if rid in done_set :
173174 done_num += 1
@@ -176,8 +177,10 @@ def main(file_name, clean_src_file):
176177 print (f"{ done_num } repos was already done. PASS." )
177178 done_num = - 1
178179 print ("\n " + "↓" * 20 + f" { tm ()} { rid } start " + "↓" * 20 )
180+ final = False
181+ if idx == len (file_data ): final = True
179182 # 需要获取converter返回的新的chunk_counter,否则这里不知道在写入jsonl的时候counter是否有增加
180- chunk_counter = parse_one_line (line , fastest_ip , clean_src_file , output_folder = output_folder , chunk_counter = chunk_counter )
183+ chunk_counter = parse_one_line (line , fastest_ip , clean_src_file , output_folder = output_folder , chunk_counter = chunk_counter , final = final )
181184 with open ("./.done" , "a" , encoding = 'utf-8' )as a :
182185 a .write (rid + "\n " )
183186 print ("↑" * 20 + f" { tm ()} { rid } done " + "↑" * 21 )
0 commit comments