diff --git a/api/tasks.py b/api/tasks.py index a1bf36c..58c7a9d 100644 --- a/api/tasks.py +++ b/api/tasks.py @@ -7,17 +7,16 @@ from django.http import HttpRequest from manolo.celery import app -from scrapers.manolo_scraper.pipelines import process_items - +from scrapers.manolo_scraper.pipelines import process_items, process_row log = logging.getLogger(__name__) @app.task -def process_json_request(data, institution_name: str) -> None: +def process_json_request(data) -> None: for line in data: - items = json.loads(line) - process_items(items, institution=institution_name) + item = json.loads(line) + process_row(item) @app.task diff --git a/api/views.py b/api/views.py index 31818dc..6eb3cca 100644 --- a/api/views.py +++ b/api/views.py @@ -176,12 +176,12 @@ def save_json(request): if is_key_valid(request) is False: return HttpResponse("bad key") - name = request.FILES["file"].name.replace(".json", "") + institution_name = request.FILES["file"].name.replace(".json", "") binary_data = request.FILES['file'].read() data = binary_data.decode().splitlines() - task = process_json_request.s(data, institution_name=name) - task.apply_async(link_error=log_task_error.s(name)) + task = process_json_request.s(data) + task.apply_async(link_error=log_task_error.s(institution_name)) return HttpResponse('ok') diff --git a/manolo/settings/base.py b/manolo/settings/base.py index fb1dbfd..a979899 100644 --- a/manolo/settings/base.py +++ b/manolo/settings/base.py @@ -120,6 +120,7 @@ 'django.middleware.security.SecurityMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', 'django.middleware.locale.LocaleMiddleware', + 'corsheaders.middleware.CorsMiddleware', 'django.middleware.common.CommonMiddleware', 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', @@ -127,6 +128,12 @@ 'django.middleware.clickjacking.XFrameOptionsMiddleware', ] +CORS_ALLOWED_ORIGINS = [ + "https://laencerrona.pe" +] +CORS_ALLOW_IFRAME = True +X_FRAME_OPTIONS = 'ALLOW-FROM https://laencerrona.pe' + ROOT_URLCONF = 'manolo.urls' # Python dotted path to the WSGI application used by Django's runserver. @@ -144,6 +151,7 @@ ] THIRD_PARTY_APPS = [ + "corsheaders", "crispy_forms", "crispy_bootstrap5", 'registration', diff --git a/manolo/templates/search/search.html b/manolo/templates/search/search.html index 3ef67c5..d1ad614 100644 --- a/manolo/templates/search/search.html +++ b/manolo/templates/search/search.html @@ -119,7 +119,7 @@ {% for i in page.object_list %} - + {{ i.institution }} diff --git a/requirements/base.txt b/requirements/base.txt index cd54479..d03c567 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -15,6 +15,7 @@ django-registration-redux django-rest-swagger djangorestframework-api-key djangorestframework==3.15.2 +django-cors-headers drf_yasg gunicorn[gevent] gunicorn[eventlet] diff --git a/scrapers/manolo_scraper/pipelines.py b/scrapers/manolo_scraper/pipelines.py index ba0d658..ea12f7c 100644 --- a/scrapers/manolo_scraper/pipelines.py +++ b/scrapers/manolo_scraper/pipelines.py @@ -47,9 +47,9 @@ def process_item(self, item, spider): return item -def process_items(items, institution): +def process_items(items): for item in items: - process_row(item, institution) + process_row(item) def process_item(item): @@ -143,21 +143,31 @@ def process_row(row): fecha = datetime.strptime(fecha, '%Y-%m-%d').date() id_document = row['id_document'] id_number = row['id_number'] + try: + host_name, office, host_title = row['host_name'].split(' - ') + except ValueError: + try: + host_name, office = row['host_name'].split(' - ') + host_title = '' + except ValueError: + host_name = row['host_name'] + office = '' + host_title = '' item = { 'full_name': row['full_name'], 'entity': row['entity'], "id_number": id_number, "id_document": id_document, - 'host_name': row['host_name'], - "office": row['office'], - "host_title": row['host_title'], + 'host_name': host_name, + "office": office, + "host_title": host_title, 'reason': row['reason'], "meeting_place": row['meeting_place'], 'institution': row['institution'], "time_start": row['time_start'], "time_end": row['time_end'], - "location": row["location"], + "location": row.get("location"), 'date': fecha, } item = make_hash(item) diff --git a/visitors/management/commands/print_dates_to_scrape.py b/visitors/management/commands/print_dates_to_scrape.py index 532d2f9..87f84fb 100644 --- a/visitors/management/commands/print_dates_to_scrape.py +++ b/visitors/management/commands/print_dates_to_scrape.py @@ -1,4 +1,8 @@ from datetime import date, timedelta +from django.db import connection +from datetime import datetime, timedelta +from django.db.models import Q, Min, Max +from datetime import datetime from django.core.management import BaseCommand @@ -9,51 +13,124 @@ class Command(BaseCommand): help = "Print dates that need scraping" def add_arguments(self, parser): - parser.add_argument( - '-i', - '--institution', - action='store', - choices=[ - 'pcm', - 'minjus', - 'minedu', - 'mincetur', - 'ambiente', - 'mef', - 'minagr', - # to update, - 'minem', - 'vivienda', - 'min. mujer', - 'produce', - 'trabajo', - ] - ) + pass def handle(self, *args, **options): - institution = options['institution'] - print_missing_dates(institution) + analyze_all_institutions() + + + + + +def get_all_institutions(): + """Get list of all institutions in the database""" + return Visitor.objects.values_list('institution', flat=True).distinct() + + +def find_missing_dates_by_institution(institution, start_date=None, end_date=None): + """ + Find dates that have no visitor records for a specific institution. + + Args: + institution (str): Name of the institution to check + start_date (datetime.date, optional): Start date for the search + end_date (datetime.date, optional): End date for the search + + Returns: + tuple: (list of missing dates, date_range dict with min and max dates) + """ + # If no dates provided, get them from the database for this institution + if not start_date or not end_date: + date_range = Visitor.objects.filter(institution=institution).aggregate( + min_date=Min('date'), + max_date=Max('date') + ) + start_date = date_range['min_date'] + end_date = date_range['max_date'] + + if not start_date or not end_date: + return [], {'min_date': None, 'max_date': None} + + # Use raw SQL for better performance + query = """ + WITH RECURSIVE date_series(date) AS ( + SELECT date_trunc('day', %s::timestamp)::date + UNION ALL + SELECT (date + '1 day'::interval)::date + FROM date_series + WHERE date < %s::date + ), + dates_with_records AS ( + SELECT DISTINCT date::date + FROM visitors_visitor + WHERE institution = %s + AND date BETWEEN %s::date AND %s::date + ) + SELECT date_series.date + FROM date_series + LEFT JOIN dates_with_records ON date_series.date = dates_with_records.date + WHERE dates_with_records.date IS NULL + ORDER BY date_series.date; + """ + + with connection.cursor() as cursor: + cursor.execute(query, [start_date, end_date, institution, start_date, end_date]) + missing_dates = [row[0] for row in cursor.fetchall()] + + return missing_dates, {'min_date': start_date, 'max_date': end_date} + + +def print_institution_report(institution, missing_dates, date_range): + """ + Print a formatted report of missing dates for a specific institution + """ + if not date_range['min_date']: + print(f"\n{institution}:") + print("No records found for this institution") + return + + total_missing = len(missing_dates) + total_days = (date_range['max_date'] - date_range['min_date']).days + 1 + coverage_percent = ((total_days - total_missing) / total_days) * 100 + + print(f"\n{institution}:") + print("=" * 50) + print(f"Date range: {date_range['min_date']} to {date_range['max_date']}") + print(f"Total days in range: {total_days}") + print(f"Days with no records: {total_missing}") + print(f"Coverage: {coverage_percent:.1f}%") + if missing_dates: + print("\nMissing dates by month:") + print("-" * 50) -def print_missing_dates(institution): - items = Visitor.objects.filter( - institution=institution, - ).distinct('date').values('date') + current_month = None + for date in missing_dates: + month = date.strftime("%B %Y") + if month != current_month: + current_month = month + print(f"\n{month}:") + print(f" - {date.strftime('%d/%m/%Y')} ({date.strftime('%A')})") - all_dates = [] - for item in items: - all_dates.append(item['date']) - all_dates = sorted(all_dates) - start_date = all_dates[0] +def analyze_all_institutions(start_date=None, end_date=None): + """ + Analyze missing dates for all institutions + """ + institutions = get_all_institutions() - now = date.today() - diff = now - start_date + if not institutions: + print("No institutions found in the database!") + return - for i in range(0, diff.days): - day = start_date + timedelta(days=i) + print("\nAnalyzing visitor records by institution...") + print( + "Date range:", f"{start_date} to {end_date}" if start_date and end_date else "Full dataset" + ) - # it is a weekday 0 to 4 - if day not in all_dates \ - and day.weekday() < 5: - print(day) + for institution in sorted(institutions): + if institution: # Skip null values + missing_dates, date_range = find_missing_dates_by_institution( + institution, start_date, end_date + ) + print_institution_report(institution, missing_dates, date_range) diff --git a/visitors/views.py b/visitors/views.py index a56532c..da74d16 100644 --- a/visitors/views.py +++ b/visitors/views.py @@ -110,28 +110,38 @@ def statistics_api(request): def search(request): user_profile = get_user_profile(request) query = request.GET.get('q') or '' - query = query.strip() - if len(query.split()) == 1: - single_word_query = True - else: - single_word_query = False + institution = request.GET.get('i') or '' - if query_is_dni(query): - # do dni search - all_items = do_dni_search(query) + if institution: + all_items = Visitor.objects.filter( + institution=institution, + ).order_by("-date") + query = institution else: - if single_word_query: - all_items = Visitor.objects.filter( - full_search=SearchQuery(query) - )[0:2000] + query = query.strip() + + if len(query.split()) == 1: + single_word_query = True else: - all_items = Visitor.objects.filter( - full_search=SearchQuery(query) - ) + single_word_query = False + + if query_is_dni(query): + # do dni search + all_items = do_dni_search(query) + else: + if single_word_query: + all_items = Visitor.objects.filter( + full_search=SearchQuery(query) + )[0:2000] + else: + all_items = Visitor.objects.filter( + full_search=SearchQuery(query) + ) + + # sort queryset + if not single_word_query: + all_items = do_sorting(request, all_items) - # sort queryset - if not single_word_query: - all_items = do_sorting(request, all_items) # paginate queryset paginator, page = do_pagination(request, all_items)