-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_extractor.py
More file actions
60 lines (47 loc) · 1.54 KB
/
pdf_extractor.py
File metadata and controls
60 lines (47 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from attr import s
import requests, json
import re
import pandas as pd
import pytesseract as pt
from bs4 import BeautifulSoup
from pdf2image import convert_from_bytes
def getText(url):
    """Download the PDF at *url* and OCR every page.

    Parameters
    ----------
    url : str
        Direct link to a PDF file.

    Returns
    -------
    str
        OCR text of all pages concatenated, one '\\n' after each page.
        OCR runs with Hindi + Marathi + English language models.
    """
    response = requests.get(url)
    # Fail fast on HTTP errors instead of OCR-ing an error page.
    response.raise_for_status()
    # response.content is the fully content-decoded body; the original
    # response.raw.read() returns the raw transfer stream, which may still
    # be gzip-compressed and would hand pdf2image corrupt bytes.
    pages = convert_from_bytes(response.content)
    text = ''
    for page in pages:
        text += pt.image_to_string(page, lang='hin+mar+eng') + '\n'
    print(f'Pages extracted: {len(pages)}' + '\n')
    return text
def getActualPDF(link):
    """Scrape an HTML page and return the first hyperlink ending in '.pdf'.

    Parameters
    ----------
    link : str
        URL of an HTML landing page expected to link to a PDF.

    Returns
    -------
    str or None
        The href of the first anchor whose target ends with '.pdf',
        or None when the page contains no such link.
    """
    response = requests.get(url = link)
    # Fail fast on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True skips anchors without an href attribute, replacing the
    # original bare `except: pass` that silently swallowed every error.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if href.endswith('.pdf'):
            return href
    return None
if __name__ == '__main__':
    # Swap in 'data/PDF_url_list_test.csv' for a small test run.
    allLinks = pd.read_csv('data/PDF_url_list.csv', header = None)
    allLinks = list(allLinks.iloc[:, 0])

    # Resolve landing-page URLs to direct PDF links in place; entries that
    # yield no PDF become None and are skipped below.
    for i, link in enumerate(allLinks):
        if not link.endswith('.pdf'):
            allLinks[i] = getActualPDF(link)

    pdfs_json = []
    for link in allLinks:
        if link is not None:
            print(f'Resolving url: {link}')
            pdfs_json.append({
                "page-url": link,
                "pdf-url": link,
                "pdf-content": getText(link)
            })

    # utf-8 + ensure_ascii=False keep the OCR'd Devanagari (Hindi/Marathi)
    # text human-readable in the output file instead of \uXXXX escapes.
    with open('data/pdf_extract.json', 'w', encoding='utf-8') as outfile:
        json.dump(pdfs_json, outfile, ensure_ascii=False)