Merge pull request #272 from Carlonii/pdftotext

DhanushNehru · web-flow · commit 80c7d7ebe472 · 2024-10-02T17:10:00.000+05:30
PDF To Text
diff --git a/PDF to text/Atividade 28 Fev.pdf b/PDF to text/Atividade 28 Fev.pdf
diff --git a/PDF to text/README.md b/PDF to text/README.md
@@ -0,0 +1,28 @@
+# PDF to Text Converter
+
+This project is a Python tool designed to convert PDF files into clean and readable text. It is built to extract text from both local and remote PDFs, perform post-processing to improve readability, and save the formatted content into `.txt` files. The project also includes features for downloading PDFs from URLs and cleaning up the extracted text to prevent issues with line breaks and disorganized spacing.
+
+---
+
+## Features
+1. **Text Extraction from Local and Remote PDFs**:
+   - Supports PDF files stored locally and PDFs available via URL.
+2. **Text Cleaning and Formatting**:
+   - Removes unwanted line breaks and excessive spacing.
+   - Preserves paragraphs and maintains the original structure.
+3. **Saving Extracted Text as `.txt` Files**:
+   - The extracted text can be saved as a `.txt` file with the same name as the original PDF.
+4. **Automatic Output Folder Creation**:
+   - Organizes generated text files into an `output_texts` folder for easy navigation and future use.
+
+## Requirements
+
+Make sure to have the following libraries installed:
+
+- `requests`
+- `PyPDF2`
+
+If you do not have them yet, install them using:
+
+```bash
+pip install requests PyPDF2
+```
diff --git a/PDF to text/script.py b/PDF to text/script.py
@@ -0,0 +1,77 @@
+import os
+import re
+import sys
+import tempfile
+from urllib.parse import unquote, urlparse
+
+import requests
+import PyPDF2
+
+def download_pdf(url, local_filename, timeout=30):
+    """Download a PDF from a URL to a local file.
+
+    Args:
+        url: Address of the PDF to fetch.
+        local_filename: Path the downloaded bytes are written to.
+        timeout: Seconds to wait for the server before giving up.
+
+    Raises:
+        requests.RequestException: If the request fails, times out, or the
+            server answers with an HTTP error status.
+    """
+    response = requests.get(url, timeout=timeout)
+    # Fail loudly on 4xx/5xx instead of saving an error page as a ".pdf".
+    response.raise_for_status()
+    with open(local_filename, 'wb') as f:
+        f.write(response.content)
+
+def extract_text_from_pdf(pdf_path):
+    """Extract and clean the text of a single PDF file.
+
+    Args:
+        pdf_path: Path to a local PDF file.
+
+    Returns:
+        The cleaned text of all pages, or None if the file could not be
+        opened or parsed (the error is printed rather than raised).
+    """
+    try:
+        with open(pdf_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+            # `or ""` guards against pages for which no text is returned.
+            pages = [page.extract_text() or "" for page in reader.pages]
+        # Join pages with a newline so the last word of one page does not
+        # fuse with the first word of the next, then clean the result.
+        return clean_extracted_text("\n".join(pages))
+    except Exception as e:
+        # Best-effort: report the problem and let the caller handle None.
+        print(f"Failed to read {pdf_path}: {e}")
+        return None
+
+def clean_extracted_text(text):
+    """Clean and format the extracted text.
+
+    Line breaks inside a sentence are replaced by a space, runs of spaces
+    and tabs are collapsed, and paragraph breaks (one or more blank lines)
+    are normalised to exactly one blank line. A line break that directly
+    follows a period is kept.
+
+    Args:
+        text: Raw text as extracted from the PDF; None or "" is accepted.
+
+    Returns:
+        The cleaned text with leading and trailing whitespace removed.
+    """
+    if not text:
+        return ""
+    # Normalise Windows / old-Mac line endings first.
+    cleaned_text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Drop spaces/tabs around line breaks so blank-looking lines count as
+    # blank and a trailing space cannot hide a sentence-ending period.
+    cleaned_text = re.sub(r'[^\S\n]*\n[^\S\n]*', '\n', cleaned_text)
+    # Preserve paragraphs: any run of blank lines becomes one blank line.
+    cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)
+    # Join lines broken mid-sentence. The lookbehind excludes '\n' as well
+    # as '.', otherwise the second newline of a paragraph break is eaten.
+    cleaned_text = re.sub(r'(?<![.\n])\n(?!\n)', ' ', cleaned_text)
+    # Collapse horizontal whitespace only; plain '\s+' would also swallow
+    # the newlines preserved above.
+    cleaned_text = re.sub(r'[^\S\n]+', ' ', cleaned_text)
+    return cleaned_text.strip()
+
+def convert_pdf_to_txt(pdf_path, save_to_file=True, output_folder="output_texts"):
+    """Convert a single PDF to text, optionally saving it to a file.
+
+    Args:
+        pdf_path: Local path or http(s) URL of the PDF.
+        save_to_file: When true, write the text to
+            `<output_folder>/<pdf name>.txt`.
+        output_folder: Folder for the generated .txt file; created on demand.
+
+    Errors are printed rather than raised, so one bad input does not stop
+    a caller that processes several PDFs.
+    """
+    try:
+        # Check if the path is a URL or local file. Match the full scheme so
+        # a local path such as "http_docs/a.pdf" is not treated as a URL.
+        if pdf_path.lower().startswith(("http://", "https://")):
+            # Name the output after the URL path only (no query string or
+            # fragment), with percent-escapes such as %20 decoded.
+            source_name = os.path.basename(unquote(urlparse(pdf_path).path))
+            # Download to a real temporary file: output_folder may not exist
+            # yet and should not be created when save_to_file is False.
+            fd, local_pdf = tempfile.mkstemp(suffix=".pdf")
+            os.close(fd)
+            try:
+                download_pdf(pdf_path, local_pdf)
+                text = extract_text_from_pdf(local_pdf)
+            finally:
+                os.remove(local_pdf)  # Remove the temporary file
+        else:
+            # Handle local file
+            source_name = os.path.basename(pdf_path)
+            text = extract_text_from_pdf(pdf_path)
+
+        if not text:
+            print(f"Could not extract text from: {pdf_path}")
+            return
+
+        # Print the cleaned text
+        print(f"\nExtracted text:\n{text}\n")
+
+        if save_to_file:
+            # Save the extracted text to a .txt file
+            os.makedirs(output_folder, exist_ok=True)
+            # Fall back to a fixed name when the URL has no file component.
+            base_name = os.path.splitext(source_name)[0] or "downloaded"
+            output_file = os.path.join(output_folder, f"{base_name}.txt")
+            with open(output_file, 'w', encoding='utf-8') as txt_file:
+                txt_file.write(text)
+            print(f"Text successfully saved to: {output_file}")
+    except Exception as e:
+        print(f"Error processing {pdf_path}: {e}")
+
+# Example usage:
+#   python script.py "path/to/file.pdf"
+#   python script.py "https://fase.org.br/wp-content/uploads/2014/05/exemplo-de-pdf.pdf"
+#
+# With no argument, the sample PDF committed next to this script is used.
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        pdf = sys.argv[1]
+    else:
+        # Resolve the sample relative to this file instead of a path that
+        # only exists on the original author's machine.
+        pdf = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "Atividade 28 Fev.pdf"
+        )
+
+    # Convert PDF to text and save the cleaned text to a file
+    convert_pdf_to_txt(pdf)
diff --git a/README.md b/README.md
@@ -88,6 +88,7 @@ More information on contributing and the general code of conduct for discussion
 | Password Generator                   | [Password Generator](https://github.com/DhanushNehru/Python-Scripts/tree/master/Password%20Generator)                                         | Generates a random password.                                                                                        |
 | Password Manager                     | [Password Manager](https://github.com/nem5345/Python-Scripts/tree/master/Password%20Manager)                                                  | Generate and interact with a password manager.                                                                      |
 | PDF to Audio                         | [PDF to Audio](https://github.com/DhanushNehru/Python-Scripts/tree/master/PDF%20to%20Audio)                                                   | Converts PDF to audio.                                                                                              |
+| PDF to Text                          | [PDF to Text](https://github.com/DhanushNehru/Python-Scripts/tree/master/PDF%20to%20text)                                                     | Converts PDF to text.                                                                                               |
 | Planet Simulation                    | [Planet Simulation](https://github.com/DhanushNehru/Python-Scripts/tree/master/Planet%20Simulation)                                              | A simulation of several planets rotating around the sun.
 | Playlist Exchange                    | [Playlist Exchange](https://github.com/DhanushNehru/Python-Scripts/tree/master/Playlist%20Exchange)                                              | A Python script to exchange songs and playlists between Spotify and Python.   
 | PNG TO JPG CONVERTOR                    | [PNG-To-JPG](https://github.com/DhanushNehru/Python-Scripts/tree/master/PNG%20To%20JPG)                                              | A PNG TO JPG IMAGE CONVERTOR.