-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSubExtractor.py
More file actions
102 lines (93 loc) · 3.1 KB
/
SubExtractor.py
File metadata and controls
102 lines (93 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import sys
import time
import numpy
import zhconv
import difflib
import paddleocr
import subprocess
import multiprocessing
def sec_to_hms(s: float) -> str:
    """Format a duration in seconds as an ``HH:MM:SS.mmm`` timestamp.

    The value is rounded to the nearest millisecond first and then split with
    integer arithmetic.  Splitting the float directly (``frac // 0.001``)
    loses a millisecond on values such as 0.5 or 1.001 because 0.001 has no
    exact binary representation.

    Args:
        s: Non-negative duration in seconds.

    Returns:
        The timestamp string; hours are zero-padded to at least two digits.
    """
    total_ms = int(round(float(s) * 1000))
    total_s, ms = divmod(total_ms, 1000)
    h, rem = divmod(total_s, 3600)
    m, sec = divmod(rem, 60)
    return f'{h:0>2}:{m:0>2}:{sec:0>2}.{ms:0>3}'
def frame_pipe(path: str) -> subprocess.Popen:
    """Start ffmpeg on *path* and return the running process.

    ffmpeg is asked to emit raw video in the ``gray`` pixel format through
    the ``image2pipe`` muxer to its standard output, which is exposed as
    ``stdout`` on the returned ``Popen`` object.
    """
    output_options = ['-c:v', 'rawvideo', '-pix_fmt', 'gray', '-f', 'image2pipe', '-']
    return subprocess.Popen(
        ['ffmpeg.cmd', '-i', path, *output_options],
        stdout=subprocess.PIPE,
    )
def cut_frame(frame: bytes, height: int = 1080, width: int = 1920) -> numpy.ndarray:
    """Keep only the top and bottom fifths of a raw single-channel frame.

    The buffer is interpreted as ``height`` rows of ``width`` unsigned bytes.
    The top ``height // 5`` rows and the bottom ``height // 5`` rows are
    stacked vertically; the middle of the picture is discarded.  With the
    default 1920x1080 geometry this keeps rows 0-215 and 864-1079.

    Args:
        frame: Raw frame data, exactly ``height * width`` bytes.
        height: Number of rows in the frame.
        width: Number of bytes per row.

    Returns:
        A ``uint8`` array of shape ``(2 * (height // 5), width)``.

    Raises:
        ValueError: If ``frame`` does not contain exactly one full frame.
    """
    pixels = numpy.frombuffer(frame, numpy.uint8)
    if pixels.size != height * width:
        raise ValueError(
            f'expected {height * width} bytes for a {width}x{height} frame, '
            f'got {pixels.size}'
        )
    image = pixels.reshape((height, width))
    band = height // 5
    return numpy.vstack((image[:band], image[height - band:]))
def ocr_engine() -> paddleocr.PaddleOCR:
    """Create a PaddleOCR instance using the models stored on local disk.

    The recognition, detection and classification model directories are
    fixed paths; PaddleOCR's own logging is switched off.
    """
    model_dirs = {
        'rec_model_dir': 'D:/Portable/Bin/Scripts/Models/rec',
        'det_model_dir': 'D:/Portable/Bin/Scripts/Models/det',
        'cls_model_dir': 'D:/Portable/Bin/Scripts/Models/cls',
    }
    return paddleocr.PaddleOCR(show_log=False, **model_dirs)
def trans_char(con: str) -> str:
    """Normalise recognised text so that similar lines compare equal.

    The text is upper-cased, curly single/double quotes are replaced by
    their straight ASCII counterparts, and the result is passed through
    ``zhconv.convert`` with the ``zh-cn`` target.
    """
    straight_quotes = str.maketrans({'“': '"', '”': '"', '‘': "'", '’': "'"})
    normalised = con.upper().translate(straight_quotes)
    return zhconv.convert(normalised, 'zh-cn')
def rec_char(line: multiprocessing.Queue, res: list) -> None:
    """Worker loop: OCR queued frames and collect the recognised text.

    Items are taken from ``line`` until a falsy value arrives.  Each item is
    a dict whose ``'con'`` entry holds the image to recognise; that entry is
    overwritten with the recognised lines joined by newlines (or a lone
    newline pair source when nothing was found) and the dict is appended to
    ``res``.
    """
    engine = ocr_engine()
    while True:
        item = line.get()
        if not item:
            # Any falsy value is the shutdown signal from the producer.
            break
        # Only the first element of the OCR result is used (one image in).
        detections = engine.ocr(item['con'], cls=False)[0]
        if detections:
            # entry[-1][0] is presumably the text of a detection — confirm
            # against the PaddleOCR result layout.
            texts = [entry[-1][0] for entry in detections]
        else:
            texts = ['\n']
        item['con'] = '\n'.join(texts)
        res.append(item)
def get_vtt(res: list, fps: float = 24000 / 1001) -> str:
    """Build a WebVTT document from per-frame OCR results.

    Frames are processed in order of their ``'num'`` value.  Consecutive
    frames whose normalised text is at least 70% similar (difflib ratio) to
    the first frame of the current run are merged into one cue carrying that
    first frame's text.

    Args:
        res: Dicts with a frame number under ``'num'`` and the recognised
            text under ``'con'``.  The list and its dicts are not modified.
        fps: Frame rate used to convert frame numbers to timestamps.

    Returns:
        The WebVTT text.  Every cue is followed by a blank line, runs without
        any text produce no cue, and an empty ``res`` yields just the header.
    """
    header = 'WEBVTT\n\n'
    if not res:
        return header

    # Work on (frame number, normalised text) pairs so the caller's data is
    # left untouched.
    frames = [
        (item['num'], trans_char(item['con']))
        for item in sorted(res, key=lambda x: x['num'])
    ]

    def cue(first_num: int, last_num: int, text: str) -> str:
        # Blank lines are not allowed inside a cue payload, and a run with no
        # text at all is simply "no subtitle on screen".
        payload = [ln for ln in text.splitlines() if ln.strip()]
        if not payload:
            return ''
        begin = sec_to_hms(first_num / fps)
        # The last frame stays on screen until the next one starts, so the
        # cue ends one frame later; this also keeps end > start for a
        # single-frame cue.
        finish = sec_to_hms((last_num + 1) / fps)
        return f'{begin} --> {finish}\n' + '\n'.join(payload) + '\n\n'

    differ = difflib.SequenceMatcher()
    vtt_list = [header]
    start_num, start_text = frames[0]
    end_num = start_num
    for num, text in frames:
        differ.set_seqs(start_text, text)
        if differ.ratio() < 0.7:
            # Text changed: close the current run and start a new one here.
            vtt_list.append(cue(start_num, end_num, start_text))
            start_num, start_text = num, text
        end_num = num
    vtt_list.append(cue(start_num, end_num, start_text))
    return ''.join(vtt_list)
def main(Processor: int) -> None:
    """Extract subtitles from the video named on the command line.

    Starts ``Processor`` OCR worker processes, streams frames from ffmpeg
    into a bounded queue for them, and writes the resulting WebVTT file next
    to the input (same name, ``.vtt`` extension).

    Args:
        Processor: Number of OCR worker processes to start.
    """
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <video file>')
    path = sys.argv[1]
    frame_size = 1920 * 1080  # one gray byte per pixel, matches cut_frame

    # Append the cuDNN directory as its own PATH entry; a separator is
    # inserted when the current value does not already end with one.
    search_path = os.environ.get('PATH', '')
    if search_path and not search_path.endswith(os.pathsep):
        search_path += os.pathsep
    os.environ['PATH'] = search_path + 'D:/Portable/Bin/cuDNN'

    with multiprocessing.Manager() as manager:
        line = multiprocessing.Queue(10)
        res = manager.list()
        pool = [
            multiprocessing.Process(target=rec_char, args=(line, res))
            for _ in range(Processor)
        ]
        for worker in pool:
            worker.start()
        # Presumably gives the workers time to load their models before
        # frames start arriving — confirm whether this is still needed.
        time.sleep(4)
        try:
            # The context manager closes the pipe and waits for ffmpeg.
            with frame_pipe(path) as rawvideo:
                num = 0
                # A short read means end of stream; a trailing partial frame
                # is dropped rather than passed to cut_frame.
                while len(con := rawvideo.stdout.read(frame_size)) == frame_size:
                    line.put({'num': num, 'con': cut_frame(con)})
                    num += 1
        finally:
            # Always release the workers, even if feeding frames failed,
            # so the script cannot hang on exit.
            for _ in pool:
                line.put(False)
            for worker in pool:
                worker.join()
        # Copy the results out before the manager shuts down.
        frames = list(res)

    out_path = os.path.splitext(path)[0] + '.vtt'
    with open(out_path, 'w', encoding='utf8') as sub:
        sub.write(get_vtt(frames))
if __name__ == '__main__':
    # Script entry point: run the extraction with two OCR worker processes.
    main(Processor=2)