Skip to content

Commit 2d85b7f

Browse files
authored
Merge pull request #21 from washing1127/main
Add a determination of the last repo
2 parents 5fec50b + 04d4ae4 commit 2d85b7f

File tree

2 files changed

+10
-5
lines changed

2 files changed

+10
-5
lines changed

converter.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -221,11 +221,13 @@ def return_counter(self):
221221
def get_jsonl_file(self):
222222
return self.output / f"githubcode.{self.chunk_counter}.jsonl"
223223

224-
def __call__(self, zip_path):
224+
def __call__(self, zip_path, final=False):
225225
zip_path = Path(zip_path)
226226
self.temp_name = self.output / ("tempFile_" + zip_path.stem) # 本仓库的临时jsonl文件
227227
if os.path.exists(self.temp_name): os.unlink(self.temp_name)
228228
assert zip_path.exists(), FileNotFoundError(str(zip_path))
229229
self.get_zipfile(zip_path)
230230
if self.clean_src_file is True:
231231
zip_path.unlink()
232+
if final is True and os.path.exists(self.get_jsonl_file()): # 最后一个仓库解析完打成压缩包
233+
self.create_zip(self.get_jsonl_file())

run.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@ def down(fastest_ip, url, final_path):
9191
shutil.move(target_path, final_path)
9292
print(f"{tm()} Moved downloading file to zip file.")
9393

94-
def parse_one_line(line, fastest_ip, clean_src_file, output_folder, chunk_counter):
94+
def parse_one_line(line, fastest_ip, clean_src_file, output_folder, chunk_counter, final=False):
9595
rid, addr = line.strip().split(",", 1)
9696
addr = addr.strip()
9797
if len(rid) < 3: rid = rid.zfill(3)
@@ -127,7 +127,7 @@ def parse_one_line(line, fastest_ip, clean_src_file, output_folder, chunk_counte
127127
# 提取代码语料到jsonl
128128
print(f"{tm()} Generating jsonl files.", end=" ", flush=True)
129129
handler = Zipfile2JsonL(output_folder, target_encoding="utf-8", clean_src_file=clean_src_file, plateform="github", author=author, chunk_counter=chunk_counter)
130-
handler(final_path) # final_path: str, 最后的zip文件
130+
handler(final_path, final=final) # final_path: str, 最后的zip文件; final: bool, 是否是repos_list的最后一行(即最后一个仓库)
131131
chunk_counter = handler.return_counter()
132132
print(f"DONE! {tm()}")
133133
return chunk_counter
@@ -167,7 +167,8 @@ def main(file_name, clean_src_file):
167167

168168
done_num = 0
169169
with open(filename, "r", encoding="utf-8")as reader:
170-
for line in reader:
170+
file_data = reader.readlines()
171+
for idx,line in enumerate(file_data):
171172
rid, addr = line.strip().split(",", 1)
172173
if rid in done_set:
173174
done_num += 1
@@ -176,8 +177,10 @@ def main(file_name, clean_src_file):
176177
print(f"{done_num} repos was already done. PASS.")
177178
done_num = -1
178179
print("\n"+"↓"*20 + f" {tm()} {rid} start " + "↓" * 20)
180+
final = False
181+
if idx == len(file_data): final = True
179182
# 需要获取converter返回的新的chunk_counter,否则这里不知道在写入jsonl的时候counter是否有增加
180-
chunk_counter = parse_one_line(line, fastest_ip, clean_src_file, output_folder=output_folder, chunk_counter=chunk_counter)
183+
chunk_counter = parse_one_line(line, fastest_ip, clean_src_file, output_folder=output_folder, chunk_counter=chunk_counter, final=final)
181184
with open("./.done", "a", encoding='utf-8')as a:
182185
a.write(rid+"\n")
183186
print("↑"*20 + f" {tm()} {rid} done " + "↑" * 21)

0 commit comments

Comments (0)