@@ -168,34 +168,46 @@ def main(file_name, clean_src_file):
168168 done_set .update (r .read ().split ("\n " ))
169169
170170 done_num = 0
171+ new_add = 0
171172 with open (filename , "r" , encoding = "utf-8" )as reader :
172173 file_data = reader .readlines ()
174+ file_length = len (file_data )
173175 for idx ,line in enumerate (file_data ):
174176 rid , addr = line .strip ().split ("," , 1 )
175177 if rid in done_set :
176178 done_num += 1
177179 continue
178- if done_num >= 0 :
179- print (f"{ done_num } repos was already done. PASS." )
180- done_num = - 1
181- print ("\n " + "↓" * 20 + f" { tm ()} { rid } start " + "↓" * 20 )
180+ # if done_num >= 0:
181+ # print(f"{done_num} repos was already done. PASS.")
182+ # done_num = -1
183+ print ("\n " + "↓" * 20 + f" { tm ()} { rid } { idx + 1 } / { file_length } start " + "↓" * 20 )
182184 final = False
183185 if idx == len (file_data ): final = True
184186 # 需要获取converter返回的新的chunk_counter,否则这里不知道在写入jsonl的时候counter是否有增加
185187 chunk_counter , err = parse_one_line (line , fastest_ip , clean_src_file , output_folder = output_folder , chunk_counter = chunk_counter , final = final )
186188 if err is True :
187- print ("↑" * 20 + f" { tm ()} { rid } ERROR " + "↑" * 21 )
189+ print ("↑" * 20 + f" { tm ()} { rid } ERROR " + "↑" * 31 )
188190 else :
191+ new_add += 1
189192 with open (done_file , "a" , encoding = 'utf-8' )as a :
190193 a .write (rid + "\n " )
191- print ("↑" * 20 + f" { tm ()} { rid } done " + "↑" * 21 )
194+ print ("↑" * 20 + f" { tm ()} { rid } done " + "↑" * 31 )
192195 done_set .add (rid )
193196
197+ return file_length , done_num , new_add
198+
if __name__ == '__main__':

    filename = "repos_list.txt"
    # Whether to delete the downloaded zip files at the end, keeping only the jsonl output.
    clean_src_file = True

    # main() returns crawl totals so a summary can be printed below:
    #   file_length — total repos listed, done_num — already done before this run,
    #   new_add — newly completed in this run.
    file_length, done_num, new_add = main(file_name=filename, clean_src_file=clean_src_file)

    # NOTE: plain strings below had spurious f-prefixes (no placeholders, Ruff F541);
    # removing the prefix does not change the printed text.
    print("ALL DONE AT " + tm())
    print("爬取情况如下:")
    print(f"\t共 {file_length} 个待爬仓库")
    print(f"\t已爬取 {done_num + new_add} 个仓库")
    print(f"\t本次新增 {new_add} 个")
    print("爬取过程中可能会有各种原因导致一些仓库爬取失败,您可以隔一段时间(防止是网络原因导致)再次重启该爬虫。")
    print("如果数次重启新增仓库数量都为 0,则表明爬取已经结束,您可以压缩并提交您最终的 output 文件。")
    print("感谢您为中国 AI 发展做出的贡献!!!")