Skip to content

Commit

Permalink
Merge pull request #222 from manolo-rocks/use-cors
Browse files Browse the repository at this point in the history
Use cors
  • Loading branch information
bayta-darell-1400 authored Jan 31, 2025
2 parents 0270905 + ae9c346 commit db6a3e6
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 72 deletions.
9 changes: 4 additions & 5 deletions api/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,16 @@
from django.http import HttpRequest

from manolo.celery import app
from scrapers.manolo_scraper.pipelines import process_items

from scrapers.manolo_scraper.pipelines import process_items, process_row

log = logging.getLogger(__name__)


@app.task
def process_json_request(data, institution_name: str) -> None:
def process_json_request(data) -> None:
for line in data:
items = json.loads(line)
process_items(items, institution=institution_name)
item = json.loads(line)
process_row(item)


@app.task
Expand Down
6 changes: 3 additions & 3 deletions api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,12 @@ def save_json(request):
if is_key_valid(request) is False:
return HttpResponse("bad key")

name = request.FILES["file"].name.replace(".json", "")
institution_name = request.FILES["file"].name.replace(".json", "")
binary_data = request.FILES['file'].read()
data = binary_data.decode().splitlines()

task = process_json_request.s(data, institution_name=name)
task.apply_async(link_error=log_task_error.s(name))
task = process_json_request.s(data)
task.apply_async(link_error=log_task_error.s(institution_name))

return HttpResponse('ok')

Expand Down
8 changes: 8 additions & 0 deletions manolo/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,20 @@
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.locale.LocaleMiddleware',
'corsheaders.middleware.CorsMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

# Origins allowed to make cross-origin requests; read by the corsheaders
# middleware registered in the middleware list of this settings module.
CORS_ALLOWED_ORIGINS = [
"https://laencerrona.pe"
]
# NOTE(review): CORS_ALLOW_IFRAME does not appear among the settings
# documented by django-cors-headers -- presumably it has no effect; confirm
# against the package documentation or remove.
CORS_ALLOW_IFRAME = True
# NOTE(review): the ALLOW-FROM directive of the X-Frame-Options header is
# obsolete and not honoured by current browsers, so this value may not
# restrict framing to the listed origin. Confirm whether a
# Content-Security-Policy `frame-ancestors` directive is needed instead.
X_FRAME_OPTIONS = 'ALLOW-FROM https://laencerrona.pe'

ROOT_URLCONF = 'manolo.urls'

# Python dotted path to the WSGI application used by Django's runserver.
Expand All @@ -144,6 +151,7 @@
]

THIRD_PARTY_APPS = [
"corsheaders",
"crispy_forms",
"crispy_bootstrap5",
'registration',
Expand Down
2 changes: 1 addition & 1 deletion manolo/templates/search/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
{% for i in page.object_list %}
<tr>
<td>
<a href='{% url 'search_view' %}?q={{ i.institution }}'>
<a href='{% url 'search_view' %}?i={{ i.institution }}'>
{{ i.institution }}
</a>
</td>
Expand Down
1 change: 1 addition & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ django-registration-redux
django-rest-swagger
djangorestframework-api-key
djangorestframework==3.15.2
django-cors-headers
drf_yasg
gunicorn[gevent]
gunicorn[eventlet]
Expand Down
22 changes: 16 additions & 6 deletions scrapers/manolo_scraper/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ def process_item(self, item, spider):
return item


def process_items(items, institution):
def process_items(items):
for item in items:
process_row(item, institution)
process_row(item)


def process_item(item):
Expand Down Expand Up @@ -143,21 +143,31 @@ def process_row(row):
fecha = datetime.strptime(fecha, '%Y-%m-%d').date()
id_document = row['id_document']
id_number = row['id_number']
try:
host_name, office, host_title = row['host_name'].split(' - ')
except ValueError:
try:
host_name, office = row['host_name'].split(' - ')
host_title = ''
except ValueError:
host_name = row['host_name']
office = ''
host_title = ''

item = {
'full_name': row['full_name'],
'entity': row['entity'],
"id_number": id_number,
"id_document": id_document,
'host_name': row['host_name'],
"office": row['office'],
"host_title": row['host_title'],
'host_name': host_name,
"office": office,
"host_title": host_title,
'reason': row['reason'],
"meeting_place": row['meeting_place'],
'institution': row['institution'],
"time_start": row['time_start'],
"time_end": row['time_end'],
"location": row["location"],
"location": row.get("location"),
'date': fecha,
}
item = make_hash(item)
Expand Down
155 changes: 116 additions & 39 deletions visitors/management/commands/print_dates_to_scrape.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from datetime import date, timedelta
from django.db import connection
from datetime import datetime, timedelta
from django.db.models import Q, Min, Max
from datetime import datetime

from django.core.management import BaseCommand

Expand All @@ -9,51 +13,124 @@ class Command(BaseCommand):
help = "Print dates that need scraping"

def add_arguments(self, parser):
parser.add_argument(
'-i',
'--institution',
action='store',
choices=[
'pcm',
'minjus',
'minedu',
'mincetur',
'ambiente',
'mef',
'minagr',
# to update,
'minem',
'vivienda',
'min. mujer',
'produce',
'trabajo',
]
)
pass

def handle(self, *args, **options):
institution = options['institution']
print_missing_dates(institution)
analyze_all_institutions()





def get_all_institutions():
    """Return the distinct institution names stored for visitors.

    Returns:
        QuerySet: flat ``values_list`` of the ``institution`` column, one
        entry per distinct value, ordered by that same column.
    """
    # Order explicitly by the selected column. Any other ordering in effect
    # would have its columns added to the SELECT DISTINCT, which can make the
    # same institution name appear more than once.
    # NOTE(review): Visitor's Meta is not visible from here; if it declares no
    # default ordering this only makes the result order deterministic.
    return (
        Visitor.objects
        .order_by('institution')
        .values_list('institution', flat=True)
        .distinct()
    )


def find_missing_dates_by_institution(institution, start_date=None, end_date=None):
    """
    Find dates that have no visitor records for a specific institution.

    Args:
        institution (str): Name of the institution to check
        start_date (datetime.date, optional): First day of the range to
            check. When omitted, the earliest record date stored for the
            institution is used.
        end_date (datetime.date, optional): Last day of the range to check.
            When omitted, the latest record date stored for the institution
            is used.

    Returns:
        tuple: (list of missing dates, date_range dict with ``min_date`` and
        ``max_date``). If a bound is still unknown after consulting the
        database (the institution has no records), the result is
        ``([], {'min_date': None, 'max_date': None})``.
    """
    # Fall back to the database only for the bound(s) the caller left out, so
    # an explicitly supplied start or end date is never silently replaced.
    if not start_date or not end_date:
        stored_range = Visitor.objects.filter(institution=institution).aggregate(
            min_date=Min('date'),
            max_date=Max('date'),
        )
        start_date = start_date or stored_range['min_date']
        end_date = end_date or stored_range['max_date']

    if not start_date or not end_date:
        return [], {'min_date': None, 'max_date': None}

    # The calendar of days is generated and anti-joined against the recorded
    # dates inside the database, so only the missing days are sent back.
    query = """
        WITH RECURSIVE date_series(date) AS (
            SELECT date_trunc('day', %s::timestamp)::date
            UNION ALL
            SELECT (date + '1 day'::interval)::date
            FROM date_series
            WHERE date < %s::date
        ),
        dates_with_records AS (
            SELECT DISTINCT date::date
            FROM visitors_visitor
            WHERE institution = %s
            AND date BETWEEN %s::date AND %s::date
        )
        SELECT date_series.date
        FROM date_series
        LEFT JOIN dates_with_records ON date_series.date = dates_with_records.date
        WHERE dates_with_records.date IS NULL
        ORDER BY date_series.date;
    """

    # Parameter order mirrors the placeholders: series start, series end,
    # then the institution and the same two bounds for the record filter.
    params = [start_date, end_date, institution, start_date, end_date]
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        missing_dates = [row[0] for row in cursor.fetchall()]

    return missing_dates, {'min_date': start_date, 'max_date': end_date}


def print_institution_report(institution, missing_dates, date_range):
"""
Print a formatted report of missing dates for a specific institution
"""
if not date_range['min_date']:
print(f"\n{institution}:")
print("No records found for this institution")
return

total_missing = len(missing_dates)
total_days = (date_range['max_date'] - date_range['min_date']).days + 1
coverage_percent = ((total_days - total_missing) / total_days) * 100

print(f"\n{institution}:")
print("=" * 50)
print(f"Date range: {date_range['min_date']} to {date_range['max_date']}")
print(f"Total days in range: {total_days}")
print(f"Days with no records: {total_missing}")
print(f"Coverage: {coverage_percent:.1f}%")

if missing_dates:
print("\nMissing dates by month:")
print("-" * 50)

def print_missing_dates(institution):
items = Visitor.objects.filter(
institution=institution,
).distinct('date').values('date')
current_month = None
for date in missing_dates:
month = date.strftime("%B %Y")
if month != current_month:
current_month = month
print(f"\n{month}:")
print(f" - {date.strftime('%d/%m/%Y')} ({date.strftime('%A')})")

all_dates = []
for item in items:
all_dates.append(item['date'])

all_dates = sorted(all_dates)
start_date = all_dates[0]
def analyze_all_institutions(start_date=None, end_date=None):
"""
Analyze missing dates for all institutions
"""
institutions = get_all_institutions()

now = date.today()
diff = now - start_date
if not institutions:
print("No institutions found in the database!")
return

for i in range(0, diff.days):
day = start_date + timedelta(days=i)
print("\nAnalyzing visitor records by institution...")
print(
"Date range:", f"{start_date} to {end_date}" if start_date and end_date else "Full dataset"
)

# it is a weekday 0 to 4
if day not in all_dates \
and day.weekday() < 5:
print(day)
for institution in sorted(institutions):
if institution: # Skip null values
missing_dates, date_range = find_missing_dates_by_institution(
institution, start_date, end_date
)
print_institution_report(institution, missing_dates, date_range)
46 changes: 28 additions & 18 deletions visitors/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,28 +110,38 @@ def statistics_api(request):
def search(request):
user_profile = get_user_profile(request)
query = request.GET.get('q') or ''
query = query.strip()
if len(query.split()) == 1:
single_word_query = True
else:
single_word_query = False
institution = request.GET.get('i') or ''

if query_is_dni(query):
# do dni search
all_items = do_dni_search(query)
if institution:
all_items = Visitor.objects.filter(
institution=institution,
).order_by("-date")
query = institution
else:
if single_word_query:
all_items = Visitor.objects.filter(
full_search=SearchQuery(query)
)[0:2000]
query = query.strip()

if len(query.split()) == 1:
single_word_query = True
else:
all_items = Visitor.objects.filter(
full_search=SearchQuery(query)
)
single_word_query = False

if query_is_dni(query):
# do dni search
all_items = do_dni_search(query)
else:
if single_word_query:
all_items = Visitor.objects.filter(
full_search=SearchQuery(query)
)[0:2000]
else:
all_items = Visitor.objects.filter(
full_search=SearchQuery(query)
)

# sort queryset
if not single_word_query:
all_items = do_sorting(request, all_items)

# sort queryset
if not single_word_query:
all_items = do_sorting(request, all_items)
# paginate queryset
paginator, page = do_pagination(request, all_items)

Expand Down

0 comments on commit db6a3e6

Please sign in to comment.