9
9
import re
10
10
import tarfile
11
11
import uuid
12
+ import zipfile
12
13
from collections import Counter
13
14
from collections .abc import Generator
14
15
from datetime import UTC , datetime
@@ -406,6 +407,69 @@ def text_from_sjson_content(content: str):
406
407
return " " .join (data .get ("text" , []))
407
408
408
409
410
def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
    """
    Extract text from every document in an extracted OLX course and yield
    serialized content-file data for each one.

    Text already extracted for an unchanged file is reused: when a ContentFile
    with the same key/run exists and its archive checksum matches, the stored
    content/title are yielded instead of re-running tika, unless *overwrite*
    forces re-extraction.

    Args:
        olx_path (str): Path to the extracted OLX course directory
        run (LearningResourceRun): The run these content files belong to
        overwrite (bool): If True, re-extract text even for unchanged files

    Yields:
        dict: Serialized content file data
    """
    for document, metadata in documents_from_olx(olx_path):
        source_path = metadata.get("source_path")
        module_id = get_edx_module_id(source_path, run)
        content_type = metadata["content_type"]
        mime_type = metadata.get("mime_type")
        file_extension = metadata.get("file_extension")
        checksum = metadata.get("archive_checksum")

        existing = ContentFile.objects.filter(key=module_id, run=run).first()
        # Reuse stored text only when the file is unchanged and no overwrite
        # was requested (De Morgan of: missing, checksum-differs, or overwrite).
        reuse_existing = (
            not overwrite and existing and existing.archive_checksum == checksum
        )

        if reuse_existing:
            content_fields = {
                "content": existing.content,
                "content_title": existing.content_title,
            }
        elif settings.SKIP_TIKA and settings.ENVIRONMENT != "production":
            # Dev/test shortcut: skip the tika call entirely.
            content_fields = {"content": "", "content_title": ""}
        else:
            tika_output = extract_text_metadata(
                document,
                other_headers={"Content-Type": mime_type} if mime_type else {},
            )
            if tika_output is None:
                log.info("No tika response for %s", module_id)
                continue

            tika_metadata = tika_output.get("metadata") or {}
            content = (tika_output.get("content") or "").strip()
            # Transcript formats carry their text inside a structured payload.
            if file_extension == ".srt":
                content = text_from_srt_content(content)
            elif file_extension == ".sjson":
                content = text_from_sjson_content(content)

            if not content:
                continue

            title_limit = get_max_contentfile_length("content_title")
            content_fields = {
                "content": content,
                "content_title": (
                    metadata.get("title") or tika_metadata.get("title") or ""
                )[:title_limit],
            }

        yield {
            "key": module_id,
            "published": True,
            "content_type": content_type,
            "archive_checksum": checksum,
            "file_extension": file_extension,
            "source_path": source_path,
            "edx_module_id": module_id,
            **content_fields,
        }
+
409
473
def transform_content_files (
410
474
course_tarpath : Path , run : LearningResourceRun , * , overwrite : bool
411
475
) -> Generator [dict , None , None ]:
@@ -423,68 +487,7 @@ def transform_content_files(
423
487
with TemporaryDirectory (prefix = basedir ) as inner_tempdir :
424
488
check_call (["tar" , "xf" , course_tarpath ], cwd = inner_tempdir ) # noqa: S603,S607
425
489
olx_path = glob .glob (inner_tempdir + "/*" )[0 ] # noqa: PTH207
426
- for document , metadata in documents_from_olx (olx_path ):
427
- source_path = metadata .get ("source_path" )
428
- edx_module_id = get_edx_module_id (source_path , run )
429
- key = edx_module_id
430
- content_type = metadata ["content_type" ]
431
- mime_type = metadata .get ("mime_type" )
432
- file_extension = metadata .get ("file_extension" )
433
-
434
- existing_content = ContentFile .objects .filter (key = key , run = run ).first ()
435
- if (
436
- not existing_content
437
- or existing_content .archive_checksum != metadata .get ("archive_checksum" )
438
- ) or overwrite :
439
- if settings .SKIP_TIKA and settings .ENVIRONMENT != "production" :
440
- content_dict = {
441
- "content" : "" ,
442
- "content_title" : "" ,
443
- }
444
- else :
445
- tika_output = extract_text_metadata (
446
- document ,
447
- other_headers = {"Content-Type" : mime_type } if mime_type else {},
448
- )
449
-
450
- if tika_output is None :
451
- log .info ("No tika response for %s" , key )
452
- continue
453
-
454
- tika_content = tika_output .get ("content" ) or ""
455
- tika_metadata = tika_output .get ("metadata" ) or {}
456
- content = tika_content .strip ()
457
- if file_extension == ".srt" :
458
- content = text_from_srt_content (content )
459
- elif file_extension == ".sjson" :
460
- content = text_from_sjson_content (content )
461
-
462
- if not content :
463
- continue
464
-
465
- content_dict = {
466
- "content" : content ,
467
- "content_title" : (
468
- metadata .get ("title" ) or tika_metadata .get ("title" ) or ""
469
- )[: get_max_contentfile_length ("content_title" )],
470
- }
471
- else :
472
- content_dict = {
473
- "content" : existing_content .content ,
474
- "content_title" : existing_content .content_title ,
475
- }
476
- yield (
477
- {
478
- "key" : key ,
479
- "published" : True ,
480
- "content_type" : content_type ,
481
- "archive_checksum" : metadata .get ("archive_checksum" ),
482
- "file_extension" : file_extension ,
483
- "source_path" : source_path ,
484
- "edx_module_id" : edx_module_id ,
485
- ** content_dict ,
486
- }
487
- )
490
+ yield from _process_olx_path (olx_path , run , overwrite = overwrite )
488
491
489
492
490
493
def get_learning_course_bucket_name (etl_source : str ) -> str :
@@ -502,6 +505,7 @@ def get_learning_course_bucket_name(etl_source: str) -> str:
502
505
ETLSource .xpro .name : settings .XPRO_LEARNING_COURSE_BUCKET_NAME ,
503
506
ETLSource .mitxonline .name : settings .MITX_ONLINE_LEARNING_COURSE_BUCKET_NAME ,
504
507
ETLSource .oll .name : settings .OLL_LEARNING_COURSE_BUCKET_NAME ,
508
+ ETLSource .canvas .name : settings .CANVAS_COURSE_BUCKET_NAME ,
505
509
}
506
510
return bucket_names .get (etl_source )
507
511
@@ -536,6 +540,11 @@ def calc_checksum(filename) -> str:
536
540
Returns:
537
541
str: The md5 checksum of the file
538
542
"""
543
+ if zipfile .is_zipfile (filename ):
544
+ with zipfile .ZipFile (filename , "r" ) as zip_file :
545
+ return str (
546
+ hash (tuple (f"{ zp .filename } :{ zp .file_size } " for zp in zip_file .filelist ))
547
+ )
539
548
with tarfile .open (filename , "r" ) as tgz_file :
540
549
return str (hash (tuple (ti .chksum for ti in tgz_file .getmembers ())))
541
550
0 commit comments