Template request | Bug report | Generate Data Product
Tags: #pdf #extract #snippet #operations #text
Author: Minura Punchihewa
Description: This notebook provides a way to extract text from PDF files.
import io
import requests
from urllib.parse import urlparse
try:
from PyPDF2 import PdfFileReader, PdfFileWriter
except:
!pip install PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
# Input: location of the PDF to extract text from. It is passed to get_pdf(),
# which accepts either a URL or a local file path.
pdf_file = "https://ethereum.org/669c9e2e2027310b6b3cdce6e1c52962/Ethereum_Whitepaper_-_Buterin_2014.pdf"
def is_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    Args:
        url: The candidate value, normally a string such as
            ``"https://host/file.pdf"`` or a local path.

    Returns:
        bool: True when ``urlparse`` yields a non-empty scheme and netloc;
        False otherwise, including when the value cannot be parsed at all.
    """
    try:
        parts = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed URL (e.g. an unterminated IPv6 literal).
        # AttributeError/TypeError: non-string input such as pathlib.Path or
        # an int, which urlparse cannot decode. Such a value is not a URL.
        return False
    return bool(parts.scheme and parts.netloc)
def get_pdf(path):
    """Load a PDF from a URL or a local file and return a reader for it.

    Args:
        path: A URL (as decided by ``is_url``) or a local filesystem path.

    Returns:
        A ``PdfFileReader`` backed by an in-memory copy of the PDF bytes.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.RequestException: On connection problems or timeout.
        OSError: If the local file cannot be opened or read.
    """
    if is_url(path):
        # A timeout keeps a stalled server from hanging the notebook, and
        # raise_for_status() stops an HTTP error page from being handed to
        # the PDF parser as if it were a PDF.
        response = requests.get(path, timeout=30)
        response.raise_for_status()
        pdf_bytes = response.content
    else:
        # Read everything and close the file immediately. The reader is
        # given an in-memory stream so it never depends on an open file
        # handle that nobody would close.
        with open(path, "rb") as pdf_file_obj:
            pdf_bytes = pdf_file_obj.read()
    return PdfFileReader(io.BytesIO(pdf_bytes))
# Gather the extracted text of every page, in page order.
texts = []
pdf_reader = get_pdf(pdf_file)
page_count = pdf_reader.numPages
texts.extend(
    pdf_reader.getPage(index).extractText() for index in range(page_count)
)

# Concatenate the per-page text (no separator) and display the result.
extract_texts = "".join(texts)
print(extract_texts)