Added script to scrape PDF #29

kritikaparmar-programmer · kritikaparmar-programmer · commit d45c718d67ce · 2020-10-21T22:58:35.000+05:30
diff --git a/ScrapePDF/Readme.md b/ScrapePDF/Readme.md
@@ -0,0 +1,15 @@
+# Script to scrape pdf
+
+## Overview:
+- A beginner-friendly script to scrape PDFs. You can easily get document info such as creator, creation_date and no. of pages. Extract as many pages as you want.
+
+
+### Installing required libraries
+
+`` pip install PyPDF2 ``
+
+## How to use this script?
+
+- Navigate to the ScrapePDF folder in Command Prompt and type the following command:  
+
+python pdfscrapper.py
diff --git a/ScrapePDF/pdfscrapper.py b/ScrapePDF/pdfscrapper.py
@@ -0,0 +1,31 @@
+# import PyPDF2 library
+import PyPDF2 as p2
+
+PDFfile = open("File path here.pdf", "rb")
+pdfread = p2.PdfFileReader(PDFfile)
+
+
+# to check wheather the pdf is encrypted or not
+print(pdfread.getIsEncrypted())
+ 
+
+# to get information about the document like creator, creation_date
+print(pdfread.getDocumentInfo())
+
+
+# to get number of pages
+print(pdfread.getNumPages())
+
+
+# To extract text from a singl page of pdf
+a = int(input("Enter the page no. from which you want to extract text: \n"))
+x = pdfread.getPage(a)
+print(x.extractText())
+
+
+# Extract entire pdf
+i = 0
+while i<pdfread.getNumPages():
+    pageinfo = pdfread.getPage(i)
+    print(pageinfo.extractText())
+    i += 1