Ran pre-commit hooks:
- Removed trailing whitespace
- Ensured end-of-file newlines
- Checked YAML and JSON syntax
- Prevented addition of large files
- Formatted Python code with Black
- Linted Python code with Flake8
- Linted JavaScript and TypeScript with ESLint
- Formatted code with Prettier
- Ensured no blanket "noqa" comments or use of "eval" in Python
- Checked Python type annotations with MyPy
Vikranth3140 committed Aug 8, 2024
1 parent 49dbfed commit baaad27
Showing 27 changed files with 611 additions and 491 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,4 +5,4 @@ backend/app/routes/__pycache__/
 backend/.venv/
 backend/app/utils/.venv/
 backend/app/utils/llama_model/
-backend/app/utils/__pycache__/
\ No newline at end of file
+backend/app/utils/__pycache__/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -44,4 +44,4 @@ repos:
     rev: v0.910
     hooks:
       - id: mypy
-        language_version: python3.8 # Checks type annotations in Python files using MyPy
\ No newline at end of file
+        language_version: python3.8 # Checks type annotations in Python files using MyPy
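For context, the hunk above shows only the tail of the config. A `.pre-commit-config.yaml` wiring up the hooks named in the commit message could look roughly like the sketch below; the repo URLs and hook ids are real pre-commit projects, but every `rev` pin except the MyPy one visible above is an assumption:

```yaml
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1  # assumed pin
    hooks:
      - id: trailing-whitespace      # removes trailing whitespace
      - id: end-of-file-fixer        # ensures end-of-file newlines
      - id: check-yaml               # checks YAML syntax
      - id: check-json               # checks JSON syntax
      - id: check-added-large-files  # prevents addition of large files
  - repo: https://github.com/pre-commit/pygrep-hooks
    rev: v1.9.0  # assumed pin
    hooks:
      - id: python-check-blanket-noqa  # no blanket "noqa" comments
      - id: python-no-eval             # no use of "eval" in Python
  - repo: https://github.com/psf/black
    rev: 22.3.0  # assumed pin
    hooks:
      - id: black    # formats Python code
  - repo: https://github.com/PyCQA/flake8
    rev: 4.0.1  # assumed pin
    hooks:
      - id: flake8   # lints Python code
  - repo: https://github.com/pre-commit/mirrors-eslint
    rev: v8.24.0  # assumed pin
    hooks:
      - id: eslint   # lints JavaScript and TypeScript
  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v2.7.1  # assumed pin
    hooks:
      - id: prettier # formats code with Prettier
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.910  # matches the rev shown in this diff
    hooks:
      - id: mypy
        language_version: python3.8
```

Running `pre-commit run --all-files` against a config like this reproduces the kind of repository-wide sweep recorded in this commit.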
9 changes: 2 additions & 7 deletions README.md
@@ -12,7 +12,6 @@
 
 Welcome to the official repository of Roamify Chrome Extension. This repository contains the code and documentation for our innovative approach to providing personalized travel recommendations using a chrome extension.
 
-
 ## Table of Contents
 
 - [Introduction](#introduction)
@@ -21,11 +20,9 @@ Welcome to the official repository of Roamify Chrome Extension. This repository contains the code and documentation for our innovative approach to providing personalized travel recommendations using a chrome extension.
 - [Citing This Work](#citing-this-work)
 - [Acknowledgements](#acknowledgements)
 
-
 ## Introduction
 
-Roamify aims to revolutionize the travel experience by leveraging the power of machine learning to provide personalized recommendations through a google chrome extension. 
-
+Roamify aims to revolutionize the travel experience by leveraging the power of machine learning to provide personalized recommendations through a google chrome extension.
 
 ## Directory Structure
 
@@ -79,14 +76,12 @@ Extension/
 └── requirements.txt
 ```
 
-
 ## Extension Objectives
 
 1. **User-Friendly and Minimal Design**: Ensure the extension is easy to use for individuals of all age groups by maintaining a clean and intuitive interface.
 2. **Simplified Itinerary Creation**: Provide features that make it easier for users to create and manage travel itineraries with minimal effort.
 3. **Real-Time Data Scraping**: Enable the extension to directly scrape data from the current Google tab to generate detailed and relevant itineraries.
 
-
 ## Citing This Work
 
 If you use our work in your research or project, please cite it as follows:
@@ -104,4 +99,4 @@
 
 We would like to thank our academic institution, [IIIT Delhi](https://iiitd.ac.in/), and our guide, [Dr. Dhruv Kumar](https://kudhru.github.io/) for their support and contributions to this research.
 
-For more information on our research and publications, please visit our [Roamify Machine Learning repository](https://github.com/RoamifyRedefined/Machine-Learning).
\ No newline at end of file
+For more information on our research and publications, please visit our [Roamify Machine Learning repository](https://github.com/RoamifyRedefined/Machine-Learning).
2 changes: 1 addition & 1 deletion backend/.env
@@ -1,4 +1,4 @@
 FLASK_APP=run.py
 FLASK_ENV=development
 FLASK_DEBUG=1
-FLASK_RUN_PORT=5000
\ No newline at end of file
+FLASK_RUN_PORT=5000
24 changes: 13 additions & 11 deletions backend/app/__init__.py
@@ -3,19 +3,21 @@
 from dotenv import load_dotenv
 import os
 
+
 def create_app():
     # Load environment variables
     load_dotenv()
 
     # Initialize the Flask application
     app = Flask(__name__)
     CORS(app)
 
     # Load configurations from config.py
-    app.config.from_object('app.config')
+    app.config.from_object("app.config")
 
     # Register Blueprints
     from app.routes.process import process_bp
+
     app.register_blueprint(process_bp)
 
     return app
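Not part of this diff: the application factory above is presumably driven by the `run.py` named in `backend/.env` (`FLASK_APP=run.py`). A minimal sketch of what such an entry point would look like, assuming the env-file values shown above:

```python
# run.py (hypothetical; this file is not shown in the commit)
from app import create_app

app = create_app()

if __name__ == "__main__":
    # Port and debug flag mirror FLASK_RUN_PORT=5000 and FLASK_DEBUG=1
    # from backend/.env; `flask run` would pick these up automatically.
    app.run(port=5000, debug=True)
```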
7 changes: 4 additions & 3 deletions backend/app/config.py
@@ -1,6 +1,7 @@
 import os
 
+
 class Config:
-    DEBUG = os.getenv('FLASK_DEBUG', True)
-    TESTING = os.getenv('FLASK_TESTING', True)
-    SECRET_KEY = os.getenv('SECRET_KEY', 'roamify_key')
+    DEBUG = os.getenv("FLASK_DEBUG", True)
+    TESTING = os.getenv("FLASK_TESTING", True)
+    SECRET_KEY = os.getenv("SECRET_KEY", "roamify_key")
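One behavior worth noting (pre-existing, not introduced by this reformat): `os.getenv` returns strings, so once `FLASK_DEBUG` or `FLASK_TESTING` is actually set in the environment, `DEBUG` and `TESTING` hold strings like `"1"` or `"0"`, and any non-empty string is truthy. A sketch of a stricter pattern, should the project want real booleans:

```python
import os


def env_bool(name: str, default: bool = False) -> bool:
    """Read an environment variable as a boolean.

    os.getenv returns a string (FLASK_DEBUG=1 yields "1"), and any
    non-empty string is truthy, so "0" or "false" would otherwise
    still count as enabled.
    """
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")


class Config:
    DEBUG = env_bool("FLASK_DEBUG", True)
    TESTING = env_bool("FLASK_TESTING", True)
    SECRET_KEY = os.getenv("SECRET_KEY", "roamify_key")
```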
40 changes: 23 additions & 17 deletions backend/app/routes/process.py
@@ -3,41 +3,47 @@
 import re
 
 # Create a Blueprint
-process_bp = Blueprint('process', __name__)
+process_bp = Blueprint("process", __name__)
 
 # Pipeline Processing
 pipeline_processor = Pipeline()
 
-@process_bp.route('/process', methods=['POST', 'OPTIONS','GET'])
+
+@process_bp.route("/process", methods=["POST", "OPTIONS", "GET"])
 def process_text():
-    if request.method == 'OPTIONS':
+    if request.method == "OPTIONS":
         response = make_response()
         response.headers.add("Access-Control-Allow-Origin", "*")
         response.headers.add("Access-Control-Allow-Headers", "Content-Type")
         response.headers.add("Access-Control-Allow-Methods", "POST")
         return response
-    if request.method == 'POST':
+
+    if request.method == "POST":
         data = request.get_json()
 
-        if 'text' not in data:
-            days = data['day']
+        if "text" not in data:
+            days = data["day"]
             destination_name = ""
-            formatted_data = pipeline_processor.ollama_processing(destination_name, days)
+            formatted_data = pipeline_processor.ollama_processing(
+                destination_name, days
+            )
             return jsonify(formatted_data)
-        text = data['text']
-        days = data['day']
+
+        text = data["text"]
+        days = data["day"]
         # User preferences
-        historical = data['historical']
-        amusement = data['amusement']
-        natural = data['natural']
+        historical = data["historical"]
+        amusement = data["amusement"]
+        natural = data["natural"]
         # Process the text
-        formatted_data = pipeline_processor.t5_ollama_processing(text, days, historical, amusement, natural)
+        formatted_data = pipeline_processor.t5_ollama_processing(
+            text, days, historical, amusement, natural
+        )
        return jsonify(formatted_data)
 
-    elif request.method == 'GET':
-        return 'This is the process endpoint'
+    elif request.method == "GET":
+        return "This is the process endpoint"
 
+
 # Ensure to include this Blueprint in your main Flask app
 # app.register_blueprint(process_bp, url_prefix='/your_url_prefix')
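For reference, a client-side sketch of how the `/process` endpoint above can be exercised, assuming the app runs locally on the port from `backend/.env`; the payload values are invented, but the keys mirror exactly what `process_text()` reads:

```python
import requests

# Invented example payload; keys match what process_text() expects.
payload = {
    "text": "Scraped page text about attractions in Paris ...",
    "day": 3,            # number of itinerary days
    "historical": True,  # user preference flags
    "amusement": False,
    "natural": True,
}

resp = requests.post("http://localhost:5000/process", json=payload)
resp.raise_for_status()
print(resp.json())  # itinerary produced by the pipeline
```

Omitting the `text` key routes the request through `ollama_processing` with an empty destination name instead.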
40 changes: 23 additions & 17 deletions backend/app/utils/bert_processing.py
@@ -2,38 +2,44 @@
 from transformers import BertForQuestionAnswering
 from transformers import BertTokenizer
 
+
 class BERT_Processer:
     def __init__(self):
-        self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-        self.model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+        self.tokenizer = BertTokenizer.from_pretrained(
+            "bert-large-uncased-whole-word-masking-finetuned-squad"
+        )
+        self.model = BertForQuestionAnswering.from_pretrained(
+            "bert-large-uncased-whole-word-masking-finetuned-squad"
+        )
 
     def answer_question(self, question, paragraph):
-        encoding = self.tokenizer.encode_plus(text=question, text_pair=paragraph, add_special_tokens=True)
-        input_ids = encoding['input_ids']  # Token embeddings
-        token_type_ids = encoding['token_type_ids']  # Segment embeddings
+        encoding = self.tokenizer.encode_plus(
+            text=question, text_pair=paragraph, add_special_tokens=True
+        )
+        input_ids = encoding["input_ids"]  # Token embeddings
+        token_type_ids = encoding["token_type_ids"]  # Segment embeddings
         tokens = self.tokenizer.convert_ids_to_tokens(input_ids)  # Input tokens
 
         input_ids = torch.tensor([input_ids])
         token_type_ids = torch.tensor([token_type_ids])
 
         # Get the start and end scores from the model
         outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
         start_scores = outputs.start_logits
         end_scores = outputs.end_logits
 
         start_index = torch.argmax(start_scores)
         end_index = torch.argmax(end_scores) + 1
 
-        answer = ' '.join(tokens[start_index:end_index+1])
-        corrected_answer = ''
+        answer = " ".join(tokens[start_index : end_index + 1])
+
+        corrected_answer = ""
 
         for word in answer.split():
 
-            #If it's a subword token
-            if word[0:2] == '##':
+            # If it's a subword token
+            if word[0:2] == "##":
                 corrected_answer += word[2:]
             else:
-                corrected_answer += ' ' + word
-        return corrected_answer
+                corrected_answer += " " + word
+
+        return corrected_answer
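A usage sketch for the class above (the import path and example inputs are assumptions; the model name is taken from the code). Note also a pre-existing quirk the reformat preserves: `end_index` is already `argmax + 1`, so slicing with `end_index + 1` can pull in one token past the predicted span.

```python
from app.utils.bert_processing import BERT_Processer  # import path assumed

qa = BERT_Processer()  # first call downloads the SQuAD-finetuned BERT weights

paragraph = (
    "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars "
    "in Paris. It was completed in 1889 and is about 330 metres tall."
)
answer = qa.answer_question("How tall is the Eiffel Tower?", paragraph)
print(answer)  # expected to contain "330 metres"
```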
45 changes: 28 additions & 17 deletions backend/app/utils/llama_processing.py
@@ -13,21 +13,23 @@
 ### Response:
 {}"""
 
+
 class LlamaProcessing:
     def __init__(self, model_path):
         self.model_path = model_path
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            device_map="cuda:0"
+            model_path, device_map="cuda:0"
         )
-        self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
+        self.generator = pipeline(
+            "text-generation", model=self.model, tokenizer=self.tokenizer
+        )
 
     def predict_summary(self, text):
         prompt = alpaca_prompt.format(
             "Summarize the following Input briefly in about 2-3 lines starting with the name of the attraction.",  # instruction
             text,  # input
-            ""  # output - leave this blank for generation!
+            "",  # output - leave this blank for generation!
         )
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
 
@@ -36,30 +38,37 @@ def predict_summary(self, text):
             **inputs,
             max_new_tokens=256,  # Experiment with different values
             repetition_penalty=2.0,  # Ensure repetition_penalty is set here
-            streamer=TextStreamer(self.tokenizer)
+            streamer=TextStreamer(self.tokenizer),
         )
 
         return self.tokenizer.decode(result[0], skip_special_tokens=True)
 
     def update_summary(self, text):
-
         prompt = alpaca_prompt.format(
             "Given the following text which includes key information about a tourist attraction, generate a concise summary in 100 words",  # Update with your specific instruction
             text,  # Input text to summarize
-            ""  # Output placeholder, leave blank for generation
+            "",  # Output placeholder, leave blank for generation
         )
 
         # Tokenize the prompt
         inputs = self.tokenizer(prompt, return_tensors="pt")
 
         # Generate text based on the prompt
-        generated_texts = self.generator(prompt, max_length=256, pad_token_id=self.tokenizer.eos_token_id,
-                                         temperature=0.7, top_p=0.9, top_k=50, num_beams=5, no_repeat_ngram_size=2,
-                                         early_stopping=True)
-        generated_text = generated_texts[0]['generated_text']
+        generated_texts = self.generator(
+            prompt,
+            max_length=256,
+            pad_token_id=self.tokenizer.eos_token_id,
+            temperature=0.7,
+            top_p=0.9,
+            top_k=50,
+            num_beams=5,
+            no_repeat_ngram_size=2,
+            early_stopping=True,
+        )
+        generated_text = generated_texts[0]["generated_text"]
         print(generated_text)
         return generated_text
 
     def generate_itinerary(self):
         prompt = {
             "role": "Traveler",
@@ -76,15 +85,17 @@ def generate_itinerary(self):
                 "Notre Dame Cathedral",
                 "Champs-Elysees",
                 "Montmartre",
-                "Seine River Cruise"
-            ]
+                "Seine River Cruise",
+            ],
         }
 
         # Create a prompt for the LLaMA model
         input_prompt = f"Generate a detailed itinerary for a 3-day trip to Paris, including transportation, accommodation, and activities. The traveler is interested in Art, History, and Culture, and wants to visit the Eiffel Tower, Louvre Museum, Notre Dame Cathedral, Champs-Elysees, Montmartre, and Seine River Cruise. The budget is $500."
         # Generate the itinerary using the LLaMA model
         inputs = self.tokenizer(input_prompt, return_tensors="pt").to(self.model.device)
-        result = self.model.generate(**inputs, max_new_tokens=8192, repetition_penalty=2.0)
+        result = self.model.generate(
+            **inputs, max_new_tokens=8192, repetition_penalty=2.0
+        )
         itinerary = self.tokenizer.decode(result[0], skip_special_tokens=True)
 
         return itinerary
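Finally, a usage sketch for `LlamaProcessing`; the model path is a placeholder (the gitignored `backend/app/utils/llama_model/` directory from the first diff suggests the fine-tuned weights live there), and a CUDA device is required because of the hard-coded `device_map="cuda:0"`:

```python
from app.utils.llama_processing import LlamaProcessing  # import path assumed

# Placeholder path; weights are expected under the gitignored
# backend/app/utils/llama_model/ directory and need a CUDA GPU.
llama = LlamaProcessing("app/utils/llama_model")

print(llama.predict_summary(
    "The Louvre in Paris is the world's most visited museum and houses "
    "the Mona Lisa among roughly 35,000 exhibited works."
))

# generate_itinerary() uses a hard-coded 3-day Paris prompt and may emit
# up to 8192 new tokens, so it can be slow.
print(llama.generate_itinerary())
```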
Diffs for the remaining 18 changed files are not rendered here.
