diff --git a/api/tasks.py b/api/tasks.py
index a1bf36c..58c7a9d 100644
--- a/api/tasks.py
+++ b/api/tasks.py
@@ -7,17 +7,16 @@
from django.http import HttpRequest
from manolo.celery import app
-from scrapers.manolo_scraper.pipelines import process_items
-
+from scrapers.manolo_scraper.pipelines import process_items, process_row
log = logging.getLogger(__name__)
@app.task
-def process_json_request(data, institution_name: str) -> None:
+def process_json_request(data) -> None:
+    """Decode each JSON line in *data* and pass the row to ``process_row``.
+
+    Args:
+        data: Iterable of strings, each expected to hold one JSON document.
+
+    Blank or whitespace-only lines are skipped so that a stray empty line in
+    the payload does not abort the whole task with ``json.JSONDecodeError``.
+    Malformed non-empty lines still raise, so real data problems stay visible.
+    """
     for line in data:
-        items = json.loads(line)
-        process_items(items, institution=institution_name)
+        if not line.strip():
+            continue
+        item = json.loads(line)
+        process_row(item)
@app.task
diff --git a/api/views.py b/api/views.py
index 31818dc..6eb3cca 100644
--- a/api/views.py
+++ b/api/views.py
@@ -176,12 +176,12 @@ def save_json(request):
if is_key_valid(request) is False:
return HttpResponse("bad key")
- name = request.FILES["file"].name.replace(".json", "")
+ institution_name = request.FILES["file"].name.replace(".json", "")
binary_data = request.FILES['file'].read()
data = binary_data.decode().splitlines()
- task = process_json_request.s(data, institution_name=name)
- task.apply_async(link_error=log_task_error.s(name))
+ task = process_json_request.s(data)
+ task.apply_async(link_error=log_task_error.s(institution_name))
return HttpResponse('ok')
diff --git a/manolo/settings/base.py b/manolo/settings/base.py
index fb1dbfd..a979899 100644
--- a/manolo/settings/base.py
+++ b/manolo/settings/base.py
@@ -120,6 +120,7 @@
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.locale.LocaleMiddleware',
+ 'corsheaders.middleware.CorsMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
@@ -127,6 +128,12 @@
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
+# Origins allowed to make cross-origin requests; presumably consumed by the
+# corsheaders middleware/app enabled in this same settings module.
+CORS_ALLOWED_ORIGINS = [
+ "https://laencerrona.pe"
+]
+# NOTE(review): CORS_ALLOW_IFRAME does not appear among the settings documented
+# by django-cors-headers; it looks like a no-op here -- confirm and drop if so.
+CORS_ALLOW_IFRAME = True
+# NOTE(review): the "ALLOW-FROM" directive of X-Frame-Options is obsolete and
+# ignored by current browsers; a Content-Security-Policy "frame-ancestors"
+# header is the usual way to allow a single embedding origin -- confirm before
+# relying on this value.
+X_FRAME_OPTIONS = 'ALLOW-FROM https://laencerrona.pe'
+
ROOT_URLCONF = 'manolo.urls'
# Python dotted path to the WSGI application used by Django's runserver.
@@ -144,6 +151,7 @@
]
THIRD_PARTY_APPS = [
+ "corsheaders",
"crispy_forms",
"crispy_bootstrap5",
'registration',
diff --git a/manolo/templates/search/search.html b/manolo/templates/search/search.html
index 3ef67c5..d1ad614 100644
--- a/manolo/templates/search/search.html
+++ b/manolo/templates/search/search.html
@@ -119,7 +119,7 @@
{% for i in page.object_list %}
-
+
{{ i.institution }}
|
diff --git a/requirements/base.txt b/requirements/base.txt
index cd54479..d03c567 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -15,6 +15,7 @@ django-registration-redux
django-rest-swagger
djangorestframework-api-key
djangorestframework==3.15.2
+django-cors-headers
drf_yasg
gunicorn[gevent]
gunicorn[eventlet]
diff --git a/scrapers/manolo_scraper/pipelines.py b/scrapers/manolo_scraper/pipelines.py
index ba0d658..ea12f7c 100644
--- a/scrapers/manolo_scraper/pipelines.py
+++ b/scrapers/manolo_scraper/pipelines.py
@@ -47,9 +47,9 @@ def process_item(self, item, spider):
return item
-def process_items(items, institution):
-    for item in items:
-        process_row(item, institution)
+def process_items(items):
+    """Run ``process_row`` on every element of *items*, in iteration order."""
+    for row in items:
+        process_row(row)
def process_item(item):
@@ -143,21 +143,31 @@ def process_row(row):
fecha = datetime.strptime(fecha, '%Y-%m-%d').date()
id_document = row['id_document']
id_number = row['id_number']
+ try:
+ host_name, office, host_title = row['host_name'].split(' - ')
+ except ValueError:
+ try:
+ host_name, office = row['host_name'].split(' - ')
+ host_title = ''
+ except ValueError:
+ host_name = row['host_name']
+ office = ''
+ host_title = ''
item = {
'full_name': row['full_name'],
'entity': row['entity'],
"id_number": id_number,
"id_document": id_document,
- 'host_name': row['host_name'],
- "office": row['office'],
- "host_title": row['host_title'],
+ 'host_name': host_name,
+ "office": office,
+ "host_title": host_title,
'reason': row['reason'],
"meeting_place": row['meeting_place'],
'institution': row['institution'],
"time_start": row['time_start'],
"time_end": row['time_end'],
- "location": row["location"],
+ "location": row.get("location"),
'date': fecha,
}
item = make_hash(item)
diff --git a/visitors/management/commands/print_dates_to_scrape.py b/visitors/management/commands/print_dates_to_scrape.py
index 532d2f9..87f84fb 100644
--- a/visitors/management/commands/print_dates_to_scrape.py
+++ b/visitors/management/commands/print_dates_to_scrape.py
@@ -1,4 +1,8 @@
from datetime import date, timedelta
+from django.db import connection
+from datetime import datetime, timedelta
+from django.db.models import Q, Min, Max
+from datetime import datetime
from django.core.management import BaseCommand
@@ -9,51 +13,124 @@ class Command(BaseCommand):
help = "Print dates that need scraping"
def add_arguments(self, parser):
- parser.add_argument(
- '-i',
- '--institution',
- action='store',
- choices=[
- 'pcm',
- 'minjus',
- 'minedu',
- 'mincetur',
- 'ambiente',
- 'mef',
- 'minagr',
- # to update,
- 'minem',
- 'vivienda',
- 'min. mujer',
- 'produce',
- 'trabajo',
- ]
- )
+ """Register no command-line options; *parser* is left untouched."""
+ pass
def handle(self, *args, **options):
- institution = options['institution']
- print_missing_dates(institution)
+ """Run the command by calling ``analyze_all_institutions`` with its defaults."""
+ analyze_all_institutions()
+
+
+
+
+
+def get_all_institutions():
+    """Return the distinct ``institution`` values stored for ``Visitor`` rows.
+
+    Returns:
+        A flat ``values_list`` queryset of institution names ordered by name.
+        NULL/empty values are not filtered out here.
+
+    The explicit ``order_by('institution')`` overrides any default ordering
+    declared on the model. Django adds ordering columns to the ``SELECT``, so
+    with a default ordering on another field ``distinct()`` could return the
+    same institution more than once.
+    """
+    return (
+        Visitor.objects
+        .order_by('institution')
+        .values_list('institution', flat=True)
+        .distinct()
+    )
+
+
+def find_missing_dates_by_institution(institution, start_date=None, end_date=None):
+    """
+    Find dates that have no visitor records for a specific institution.
+
+    Args:
+        institution (str): Name of the institution to check
+        start_date (datetime.date, optional): First day of the range; when
+            omitted, the earliest ``date`` stored for the institution is used
+        end_date (datetime.date, optional): Last day of the range; when
+            omitted, the latest ``date`` stored for the institution is used
+
+    Returns:
+        tuple: (list of missing dates, date_range dict with min and max dates).
+        Both dict values are ``None`` (and the list is empty) when no usable
+        range exists: no records for the institution, or start after end.
+    """
+    # Fill in only the bound(s) the caller left out, so an explicit start or
+    # end date is honoured even when the other one is omitted.
+    if not start_date or not end_date:
+        date_range = Visitor.objects.filter(institution=institution).aggregate(
+            min_date=Min('date'),
+            max_date=Max('date')
+        )
+        start_date = start_date or date_range['min_date']
+        end_date = end_date or date_range['max_date']
+
+    # An inverted range would make the query report start_date as "missing"
+    # and gives callers a non-positive day count, so treat it as empty.
+    if not start_date or not end_date or start_date > end_date:
+        return [], {'min_date': None, 'max_date': None}
+
+    # Use raw SQL for better performance: the recursive CTE builds one row per
+    # calendar day in the range, and the LEFT JOIN ... IS NULL keeps only the
+    # days that have no visitor row for this institution.
+    # The ``::date`` / ``::timestamp`` casts are PostgreSQL syntax.
+    query = """
+        WITH RECURSIVE date_series(date) AS (
+            SELECT date_trunc('day', %s::timestamp)::date
+            UNION ALL
+            SELECT (date + '1 day'::interval)::date
+            FROM date_series
+            WHERE date < %s::date
+        ),
+        dates_with_records AS (
+            SELECT DISTINCT date::date
+            FROM visitors_visitor
+            WHERE institution = %s
+            AND date BETWEEN %s::date AND %s::date
+        )
+        SELECT date_series.date
+        FROM date_series
+        LEFT JOIN dates_with_records ON date_series.date = dates_with_records.date
+        WHERE dates_with_records.date IS NULL
+        ORDER BY date_series.date;
+    """
+
+    # Parameter order follows the placeholders: series start, series end,
+    # institution filter, then the BETWEEN bounds.
+    with connection.cursor() as cursor:
+        cursor.execute(query, [start_date, end_date, institution, start_date, end_date])
+        missing_dates = [row[0] for row in cursor.fetchall()]
+
+    return missing_dates, {'min_date': start_date, 'max_date': end_date}
+
+
+def print_institution_report(institution, missing_dates, date_range):
+    """
+    Write a plain-text coverage summary for one institution to stdout.
+
+    Args:
+        institution: Label printed as the report heading
+        missing_dates: Dates without records, in the order they should be listed
+        date_range: Dict with ``min_date`` and ``max_date``; a falsy
+            ``min_date`` means there is nothing to report
+    """
+    first_day = date_range['min_date']
+    if not first_day:
+        print(f"\n{institution}:")
+        print("No records found for this institution")
+        return
+
+    last_day = date_range['max_date']
+    gap_count = len(missing_dates)
+    span_days = (last_day - first_day).days + 1  # inclusive of both ends
+    coverage = ((span_days - gap_count) / span_days) * 100
+
+    summary_lines = (
+        f"\n{institution}:",
+        "=" * 50,
+        f"Date range: {first_day} to {last_day}",
+        f"Total days in range: {span_days}",
+        f"Days with no records: {gap_count}",
+        f"Coverage: {coverage:.1f}%",
+    )
+    for line in summary_lines:
+        print(line)
+
+    if not missing_dates:
+        return
+
+    print("\nMissing dates by month:")
+    print("-" * 50)
+
+    # Emit a month heading each time the month label changes between
+    # consecutive entries.
+    shown_month = None
+    for day in missing_dates:
+        label = day.strftime("%B %Y")
+        if label != shown_month:
+            print(f"\n{label}:")
+            shown_month = label
+        print(f" - {day.strftime('%d/%m/%Y')} ({day.strftime('%A')})")
-def print_missing_dates(institution):
-    items = Visitor.objects.filter(
-        institution=institution,
-    ).distinct('date').values('date')
- all_dates = []
- for item in items:
- all_dates.append(item['date'])
- all_dates = sorted(all_dates)
- start_date = all_dates[0]
+def analyze_all_institutions(start_date=None, end_date=None):
+    """
+    Analyze missing dates for all institutions
+
+    Args:
+        start_date: Optional first day of the range, forwarded unchanged to
+            ``find_missing_dates_by_institution``
+        end_date: Optional last day of the range, forwarded the same way
+
+    Prints one report per institution, in sorted name order.
+    """
+    # Drop NULL/empty names *before* sorting: sorted() cannot compare None
+    # with str and would raise TypeError on the first NULL institution.
+    institutions = sorted(name for name in get_all_institutions() if name)
+
+    if not institutions:
+        print("No institutions found in the database!")
+        return
+
+    print("\nAnalyzing visitor records by institution...")
+    print(
+        "Date range:", f"{start_date} to {end_date}" if start_date and end_date else "Full dataset"
+    )
+
+    for institution in institutions:
+        missing_dates, date_range = find_missing_dates_by_institution(
+            institution, start_date, end_date
+        )
+        print_institution_report(institution, missing_dates, date_range)
-    now = date.today()
-    diff = now - start_date
-    for i in range(0, diff.days):
-        day = start_date + timedelta(days=i)
-        # it is a weekday 0 to 4
-        if day not in all_dates \
-                and day.weekday() < 5:
-            print(day)
diff --git a/visitors/views.py b/visitors/views.py
index a56532c..da74d16 100644
--- a/visitors/views.py
+++ b/visitors/views.py
@@ -110,28 +110,38 @@ def statistics_api(request):
def search(request):
user_profile = get_user_profile(request)
query = request.GET.get('q') or ''
- query = query.strip()
- if len(query.split()) == 1:
- single_word_query = True
- else:
- single_word_query = False
+ institution = request.GET.get('i') or ''
- if query_is_dni(query):
- # do dni search
- all_items = do_dni_search(query)
+ if institution:
+ all_items = Visitor.objects.filter(
+ institution=institution,
+ ).order_by("-date")
+ query = institution
else:
- if single_word_query:
- all_items = Visitor.objects.filter(
- full_search=SearchQuery(query)
- )[0:2000]
+ query = query.strip()
+
+ if len(query.split()) == 1:
+ single_word_query = True
else:
- all_items = Visitor.objects.filter(
- full_search=SearchQuery(query)
- )
+ single_word_query = False
+
+ if query_is_dni(query):
+ # do dni search
+ all_items = do_dni_search(query)
+ else:
+ if single_word_query:
+ all_items = Visitor.objects.filter(
+ full_search=SearchQuery(query)
+ )[0:2000]
+ else:
+ all_items = Visitor.objects.filter(
+ full_search=SearchQuery(query)
+ )
+
+ # sort queryset
+ if not single_word_query:
+ all_items = do_sorting(request, all_items)
- # sort queryset
- if not single_word_query:
- all_items = do_sorting(request, all_items)
# paginate queryset
paginator, page = do_pagination(request, all_items)