forked from ryanguo13/PDFextractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDFextractor.py
46 lines (32 loc) · 1.53 KB
/
PDFextractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from PyPDF2 import PdfReader
import os
def extract_pdf_contents(pdf_path, save_directory):
    """Extract embedded images and URI link targets from a PDF.

    Each image XObject listed in a page's resources is written to
    ``save_directory`` as ``image_<page>_<name>.jpg`` (pages are numbered
    from 1), and each ``/Link`` annotation whose action is of type ``/URI``
    contributes its target to the returned link list. Progress is reported
    on stdout.

    Args:
        pdf_path: Passed unchanged to ``PdfReader``.
        save_directory: Directory that receives the image files. It is
            created on demand when the first image is saved; an empty
            string means the current working directory.

    Returns:
        A ``(images, links)`` tuple: ``images`` holds the data of every
        saved image (as returned by the stream's ``get_data()``) and
        ``links`` holds every ``/URI`` value found, both in page order.
    """
    pdf = PdfReader(pdf_path)
    images = []
    links = []

    for page_num, page in enumerate(pdf.pages, start=1):
        images.extend(_save_page_images(page, page_num, save_directory))
        links.extend(_collect_page_links(page))

    for link in links:
        print("Found link:", link)
        print("---------------------------------")

    return images, links


def _pdf_get(mapping, key):
    """Return ``mapping[key]``, or ``None`` when the key is absent.

    Item access is used instead of ``.get()`` so that whatever lookup
    behaviour the PDF dictionary class implements in ``__getitem__`` still
    applies, exactly as with a plain ``[]`` lookup.
    """
    try:
        return mapping[key]
    except KeyError:
        return None


def _save_page_images(page, page_number, save_directory):
    """Write every image XObject of *page* to disk and return their data."""
    resources = _pdf_get(page, '/Resources')
    if resources is None:
        # A page without a resource dictionary simply has no images.
        return []
    x_objects = _pdf_get(resources, '/XObject')
    if x_objects is None:
        return []
    x_objects = x_objects.get_object()

    saved = []
    for obj_name in x_objects:
        obj = x_objects[obj_name].get_object()
        if _pdf_get(obj, '/Subtype') != '/Image':
            continue

        # Fetch the stream data once and reuse it for both the return value
        # and the file on disk.
        data = obj.get_data()
        saved.append(data)

        obj_name_cleaned = obj_name.strip('/').replace('/', '_')
        # NOTE(review): the ".jpg" suffix is applied regardless of how the
        # image stream is actually encoded — confirm consumers tolerate that.
        image_filename = f'image_{page_number}_{obj_name_cleaned}.jpg'
        image_path = os.path.join(save_directory, image_filename)

        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when the path actually has a directory component.
        target_dir = os.path.dirname(image_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(image_path, 'wb') as img_file:
            img_file.write(data)
        print(f'Saved image to {image_path}')
    return saved


def _collect_page_links(page):
    """Return the ``/URI`` targets of all link annotations on *page*."""
    annotations = _pdf_get(page, '/Annots')
    if annotations is None:
        return []

    found = []
    for annot in annotations:
        annot_obj = annot.get_object()
        if _pdf_get(annot_obj, '/Subtype') != '/Link':
            continue
        action = _pdf_get(annot_obj, '/A')
        if action is None or _pdf_get(action, '/S') != '/URI':
            continue
        uri = _pdf_get(action, '/URI')
        if uri is not None:
            found.append(uri)
    return found