Skip to content

Commit 6351395

Browse files
authored
Merge pull request #23 from washing1127/main
add end hint
2 parents 9064f59 + 8349721 commit 6351395

File tree

1 file changed

+19
-7
lines changed

1 file changed

+19
-7
lines changed

run.py

Lines changed: 19 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -168,34 +168,46 @@ def main(file_name, clean_src_file):
168168
done_set.update(r.read().split("\n"))
169169

170170
done_num = 0
171+
new_add = 0
171172
with open(filename, "r", encoding="utf-8")as reader:
172173
file_data = reader.readlines()
174+
file_length = len(file_data)
173175
for idx,line in enumerate(file_data):
174176
rid, addr = line.strip().split(",", 1)
175177
if rid in done_set:
176178
done_num += 1
177179
continue
178-
if done_num >= 0:
179-
print(f"{done_num} repos was already done. PASS.")
180-
done_num = -1
181-
print("\n"+"↓"*20 + f" {tm()} {rid} start " + "↓" * 20)
180+
# if done_num >= 0:
181+
# print(f"{done_num} repos was already done. PASS.")
182+
# done_num = -1
183+
print("\n"+"↓"*20 + f" {tm()} {rid} {idx+1}/{file_length} start " + "↓" * 20)
182184
final = False
183185
if idx == len(file_data): final = True
184186
# 需要获取converter返回的新的chunk_counter,否则这里不知道在写入jsonl的时候counter是否有增加
185187
chunk_counter, err = parse_one_line(line, fastest_ip, clean_src_file, output_folder=output_folder, chunk_counter=chunk_counter, final=final)
186188
if err is True:
187-
print("↑"*20 + f" {tm()} {rid} ERROR " + "↑" * 21)
189+
print("↑"*20 + f" {tm()} {rid} ERROR " + "↑" * 31)
188190
else:
191+
new_add += 1
189192
with open(done_file, "a", encoding='utf-8')as a:
190193
a.write(rid+"\n")
191-
print("↑"*20 + f" {tm()} {rid} done " + "↑" * 21)
194+
print("↑"*20 + f" {tm()} {rid} done " + "↑" * 31)
192195
done_set.add(rid)
193196

197+
return file_length, done_num, new_add
198+
194199
if __name__ == '__main__':
195200

196201
filename = "repos_list.txt"
197202
clean_src_file = True # 最终是否是删除zip文件只保留jsonl
198203

199-
main(file_name=filename, clean_src_file=clean_src_file)
204+
file_length, done_num, new_add = main(file_name=filename, clean_src_file=clean_src_file)
200205

201206
print(f"ALL DONE AT " + tm())
207+
print(f"爬取情况如下:")
208+
print(f"\t{file_length} 个待爬仓库")
209+
print(f"\t已爬取 {done_num+new_add} 个仓库")
210+
print(f"\t本次新增 {new_add} 个")
211+
print("爬取过程中可能会有各种原因导致一些仓库爬取失败,您可以隔一段时间(防止是网络原因导致)再次重启该爬虫。")
212+
print("如果数次重启新增仓库数量都为 0,则表明爬取已经结束,您可以压缩并提交您最终的 output 文件。")
213+
print("感谢您为中国 AI 发展做出的贡献!!!")

0 commit comments

Comments
 (0)