Update the parse function to accept an entity id #189
The first changed file adds a `keys:` section to the example pipeline configuration:

```diff
@@ -45,6 +45,10 @@ pipeline:
       author: .//meta[@name="author"]/@content
       publishedAt: .//*[@class="date"]/text()
       description: .//meta[@property="og:description"]/@content
+    keys:
+      - title
+      - author
+      - publishedAt
    handle:
      store: store
      fetch: fetch
```

Review comment (on `keys:`): Maybe the method needs to be changed to use the built-in one?

Reply: Sure. The problem with doing that, though, is that the scraper won't be able to extract the body of the article, which is why the custom script exists. I guess, as it's an example, it doesn't really matter too much, but that is why we have a difference. Personally, I don't have an issue with having the documentation not match the example in the repo.
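The discussion implies that the `keys:` list names the extracted fields that uniquely identify an entity. As a minimal sketch of that idea — the `entity_key` helper and the field order are illustrative assumptions, not part of the PR — the configured keys could be combined with the same `make_key` helper the Python module below already imports:

```python
from servicelayer.cache import make_key  # same helper used by the aleph operations


def entity_key(data: dict, keys: list[str]) -> str:
    # Hypothetical: join the values of the configured key fields, in
    # YAML order, into one cache key; make_key skips None values.
    return make_key(*(data.get(k) for k in keys))


# e.g. with keys: [title, author, publishedAt]
key = entity_key(
    {"title": "T", "author": "A", "publishedAt": "2022-01-01"},
    ["title", "author", "publishedAt"],
)
```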
The second changed file is the Python module with the Aleph operations. The imports change first:

```diff
@@ -1,5 +1,7 @@
 from pathlib import Path
-from pprint import pprint  # noqa
+from pprint import pprint
+from typing import Optional  # noqa
+from banal import clean_dict  # type: ignore

 from alephclient import settings
 from alephclient.api import AlephAPI
```
```diff
@@ -9,59 +11,84 @@
 from servicelayer.cache import make_key  # type: ignore

 from memorious.core import get_rate_limit  # type: ignore
+from memorious.logic.context import Context


-def _create_document_metadata(context, data) -> dict:
-    meta = {}
-    languages = context.params.get("languages")
-    meta["languages"] = ensure_list(data.get("languages", languages))
-    countries = context.params.get("countries")
-    meta["countries"] = ensure_list(data.get("countries", countries))
-    mime_type = context.params.get("mime_type")
-    meta["mime_type"] = data.get("mime_type", mime_type)
-    return meta
-
-
-def _create_meta_object(context, data) -> dict:
+class Meta(MetaBase, total=False):
+    crawler: Optional[str]
+    foreign_id: Optional[str]
+    source_url: Optional[str]
+    title: Optional[str]
+    author: Optional[str]
+    publisher: Optional[str]
+    file_name: Optional[str]
+    retrieved_at: Optional[str]
+    modified_at: Optional[str]
+    published_at: Optional[str]
+    headers: any
+    keywords: any
+    parent: any
+    languages: any
+    countries: any
+    mime_type: any
```

Review comment (on `class Meta(MetaBase, total=False):`): `MetaBase` is undefined.
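The `total=False` keyword is only valid on `typing.TypedDict` subclasses, so the reviewer's point could be addressed by defining `MetaBase` as a `TypedDict`. A minimal sketch, assuming that is the intent (the exact `MetaBase` contents are an assumption, since the PR never shows it); note also that the diff's lowercase `any` annotations refer to the builtin function, where `typing.Any` is presumably intended:

```python
from typing import Any, Optional, TypedDict  # TypedDict: Python 3.8+


class MetaBase(TypedDict):
    # Assumed: a bare TypedDict base is enough to make `total=False`
    # on the subclass valid; required keys, if any, would go here.
    pass


class Meta(MetaBase, total=False):
    # Same shape as the diff above (abbreviated); every field declared
    # under total=False becomes an optional key.
    crawler: Optional[str]
    title: Optional[str]
    headers: Any  # typing.Any, not the builtin `any`
```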
The same hunk continues with the rewritten `_create_meta_object`, the annotated entry points, and the reworked cache lookup in `aleph_emit_document`:

```diff
+def _create_meta_object(context: Context, data: dict) -> Meta:
+    languages_default: list[str] = list(context.params.get("languages", []))
+    countries_default: list[str] = list(context.params.get("countries", []))
+    mime_type_default: str = context.params.get("mime_type", "")
+
+    languages = data.get("languages", languages_default)
+    countries = data.get("countries", countries_default)
+    mime_type = data.get("mime_type", mime_type_default)
     source_url = data.get("source_url", data.get("url"))
     foreign_id = data.get("foreign_id", data.get("request_id", source_url))

-    meta = {
-        "crawler": context.crawler.name,
-        "foreign_id": foreign_id,
-        "source_url": source_url,
-        "title": data.get("title"),
-        "author": data.get("author"),
-        "publisher": data.get("publisher"),
-        "file_name": data.get("file_name"),
-        "retrieved_at": data.get("retrieved_at"),
-        "modified_at": data.get("modified_at"),
-        "published_at": data.get("published_at"),
-        "headers": ensure_dict(data.get("headers")),
-        "keywords": ensure_list(data.get("keywords")),
-    }
+    parent = {}
+
     if data.get("aleph_folder_id"):
-        meta["parent"] = {"id": data.get("aleph_folder_id")}
+        parent = {"id": data.get("aleph_folder_id")}
+
+    meta = Meta(
+        crawler=context.crawler.name,
+        foreign_id=foreign_id,
+        source_url=source_url,
+        title=data.get("title"),
+        author=data.get("author"),
+        publisher=data.get("publisher"),
+        file_name=data.get("file_name"),
+        retrieved_at=data.get("retrieved_at"),
+        modified_at=data.get("modified_at"),
+        published_at=data.get("published_at"),
+        headers=data.get("headers", {}),
+        keywords=data.get("keywords", []),
+        parent=parent,
+        languages=languages,
+        countries=countries,
+        mime_type=mime_type,
+    )
+
     return meta


-def aleph_emit(context, data):
+def aleph_emit(context: Context, data: dict):
     aleph_emit_document(context, data)


-def aleph_emit_document(context, data):
+def aleph_emit_document(context: Context, data: dict):
     api = get_api(context)
     if api is None:
         return
-    collection_id = get_collection_id(context, api)
-    content_hash = data.get("content_hash")
-    source_url = data.get("source_url", data.get("url"))
-    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
-    # Fetch document id from cache
-    document = context.get_tag(make_key(collection_id, foreign_id, content_hash))
-    if isinstance(document, dict):
+
+    collection_id: str = get_collection_id(context, api)
+    content_hash: Optional[str] = data.get("content_hash")
+    source_url: str = data.get("source_url", data.get("url"))
+    foreign_id: str = data.get("foreign_id", data.get("request_id", source_url))
+    document_id: Optional[str] = context.get_tag(
+        make_key(collection_id, foreign_id, content_hash)
+    )
+
+    if document_id:
         context.log.info("Skip aleph upload: %s", foreign_id)
-        data["aleph_id"] = document["id"]
-        data["aleph_document"] = document
```
```diff
@@ -70,10 +97,9 @@ def aleph_emit_document(context, data):
         return

     meta = clean_dict(_create_meta_object(context, data))
-    meta.update(_create_document_metadata(context, data))

     label = meta.get("file_name", meta.get("source_url"))
     context.log.info("Upload: %s", label)

     with context.load_file(content_hash) as fh:
         if fh is None:
             return
```
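With `_create_document_metadata` gone, `clean_dict` from the `banal` package is what strips the missing values out of the `Meta` mapping before upload. A small illustration of that behaviour, assuming `banal.clean_dict`'s semantics of dropping `None`-valued keys:

```python
from banal import clean_dict

meta = {"title": "A headline", "author": None}
print(clean_dict(meta))
# {'title': 'A headline'} -- None values never reach the Aleph API
```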
```diff
@@ -102,7 +128,7 @@ def aleph_emit_document(context, data):
             backoff(exc, try_number)


-def aleph_folder(context, data):
+def aleph_folder(context: Context, data: dict):
     api = get_api(context)
     if api is None:
         return
```
```diff
@@ -136,19 +162,29 @@ def aleph_folder(context, data):
             backoff(ae, try_number)


-def aleph_emit_entity(context, data):
+def aleph_emit_entity(context: Context, data: dict) -> None:
     context.log.info("Emit to entity: {}".format(data.get("entity_id")))

     api = get_api(context)
     if api is None:
         return
-    collection_id = get_collection_id(context, api)
-    entity_id = data.get("entity_id")
-    source_url = data.get("source_url", data.get("url"))
-    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
+    collection_id: str = get_collection_id(context, api)
+    entity_id: Optional[str] = data.get("entity_id")
+    source_url: Optional[str] = data.get("source_url", data.get("url"))
+    foreign_id: Optional[str] = data.get(
+        "foreign_id", data.get("request_id", source_url)
+    )

-    # Fetch id from cache
+    if entity_id is None:
+        context.log.warn("No entity_id found. Skipping store")
+        context.emit(data=data, optional=True)
+        return
+
     cached_key = context.get_tag(make_key(collection_id, foreign_id, entity_id))
+
     if cached_key:
-        context.log.info("Skip entity creation: {}".format(foreign_id))
+        context.log.info("Entity exists. Skip creation: {}".format(cached_key))
         data["aleph_id"] = cached_key
         context.emit(data=data, optional=True)
         return
```
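The new guard makes the contract with upstream stages explicit: whatever parse stage feeds `aleph_emit_entity` must now put an `entity_id` into `data`, alongside the fields that `write_entity` reads. A hypothetical payload, purely for illustration (the values and the `"properties"` key are assumptions, not shown in this hunk):

```python
data = {
    "entity_id": "article-1234",       # required now; emission is skipped without it
    "schema": "Article",               # FollowTheMoney schema name (assumed value)
    "properties": {"title": ["..."]},  # assumed shape of the entity properties
    "source_url": "https://example.com/article",
}
```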
```diff
@@ -158,7 +194,7 @@ def aleph_emit_entity(context, data):
         rate_limit = get_rate_limit("aleph", limit=rate)
         rate_limit.comply()
         try:
-            res = api.write_entity(
+            res: dict[str, str] = api.write_entity(
                 collection_id,
                 {
                     "schema": data.get("schema"),
```
```diff
@@ -168,7 +204,7 @@ def aleph_emit_entity(context, data):
             )

             aleph_id = res.get("id")
-            context.log.info("Aleph entity ID: %s", aleph_id)
+            context.log.info("Entity created. entity_id is: %s", aleph_id)

             # Save the entity id in cache for future use
             context.set_tag(make_key(collection_id, foreign_id, entity_id), aleph_id)
```
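Taken together with the `get_tag` lookup earlier in the function, this `set_tag` call completes a write-once cache keyed on `(collection_id, foreign_id, entity_id)`. A condensed sketch of the round trip, assuming memorious tags persist between runs (the names come from `aleph_emit_entity`'s scope):

```python
key = make_key(collection_id, foreign_id, entity_id)

# First crawl: the entity is written to Aleph and its id remembered.
context.set_tag(key, aleph_id)

# Subsequent crawls: the lookup hits, so the write is skipped.
cached = context.get_tag(key)
if cached:
    data["aleph_id"] = cached
    context.emit(data=data, optional=True)
```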
```diff
@@ -184,19 +220,19 @@ def aleph_emit_entity(context, data):
             backoff(exc, try_number)


-def get_api(context):
+def get_api(context: Context) -> Optional[AlephAPI]:
     if not settings.HOST:
         context.log.warning("No $ALEPHCLIENT_HOST, skipping upload...")
         return None
     if not settings.API_KEY:
         context.log.warning("No $ALEPHCLIENT_API_KEY, skipping upload...")
         return None

-    session_id = "memorious:%s" % context.crawler.name
+    session_id: str = "memorious:%s" % context.crawler.name
     return AlephAPI(settings.HOST, settings.API_KEY, session_id=session_id)


-def get_collection_id(context, api):
+def get_collection_id(context: Context, api: AlephAPI) -> str:
     if not hasattr(context.stage, "aleph_cid"):
         foreign_id = context.get("collection", context.crawler.name)
         config = {"label": context.crawler.description}
```
Review comment (on the `keys:` section): I would suggest we use syntax similar to ftm mappings for consistency. Something like https://github.com/alephdata/aleph/blob/main/mappings/md_companies.yml#L15-L17, so the keys section will look like the keys list in that mapping.

Review comment: The `keys` section needs to be updated now, I think?