Skip to content

Commit dc29a7e

Browse files
authored
Merge pull request #495 from alephdata/rinat/compare-entities
Entity comparison function
2 parents 97746cd + afb9789 commit dc29a7e

File tree

6 files changed

+77
-16
lines changed

6 files changed

+77
-16
lines changed

aleph/index/xref.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def entity_query(sample, collection_id=None, query=None, broad=False):
6464
'names.text': {
6565
'query': name,
6666
'operator': 'and',
67-
'minimum_should_match': '67%',
67+
'minimum_should_match': '5%',
6868
# 'cutoff_frequency': 0.0001,
6969
# 'boost': 0.5
7070
}

aleph/logic/compare.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import itertools
2+
from Levenshtein import jaro
3+
from banal import ensure_list
4+
from followthemoney import model
5+
from followthemoney.types import registry
6+
from followthemoney.util import dampen
7+
8+
# OK, Here's the plan: we have to find a way to get user judgements
9+
# on as many of these matches as we can, then build a regression
10+
# model which properly weights the value of a matching property
11+
# based upon its type.
12+
FP_WEIGHT = 0.6  # multiplier applied to the best fingerprint score in compare_fingerprints()
13+
MATCH_WEIGHTS = {  # per-type weight of a property match in compare(); a weight of 0 skips the type
14+
registry.text: 0,
15+
registry.name: 0, # because we already compare fingerprints
16+
registry.identifier: 0.4,
17+
registry.url: 0.1,
18+
registry.email: 0.3,
19+
registry.ip: 0.1,
20+
registry.iban: 0.3,
21+
registry.address: 0.2,
22+
registry.date: 0.3,
23+
registry.phone: 0.1,
24+
registry.country: 0.1,
25+
registry.language: 0.1,
26+
}
27+
28+
29+
def compare(left, right):
    """Score how likely it is that two entities describe the same thing.

    The fingerprint score from ``compare_fingerprints`` is the starting
    point. Every property of the common schema whose type has a non-zero
    entry in ``MATCH_WEIGHTS`` then adds its ``compare_sets`` result
    multiplied by that weight.

    :param left: entity given as a dict; the ``schema``, ``fingerprints``
        and ``properties`` keys are read.
    :param right: entity dict of the same shape.
    :returns: the accumulated score clamped to [0, 1] and then scaled by
        0.9, i.e. a number between 0 and 0.9. ``0.0`` is returned when
        either schema is unknown or the two schemata are not matchable.
    """
    left_schema = model.get(left.get('schema'))
    right_schema = model.get(right.get('schema'))
    # A missing or unknown schema cannot be matched against anything;
    # bail out instead of failing on the attribute access below.
    if left_schema is None or right_schema is None:
        return 0.0
    if right_schema not in left_schema.matchable_schemata:
        return 0.0
    schema = model.precise_schema(left_schema, right_schema)
    score = compare_fingerprints(left, right)
    # ``or {}`` also covers a 'properties' key explicitly set to None.
    left_properties = left.get('properties') or {}
    right_properties = right.get('properties') or {}
    for name, prop in schema.properties.items():
        weight = MATCH_WEIGHTS.get(prop.type, 0)
        if not weight:
            # Types without a weight (or weighted 0) do not contribute.
            continue
        prop_score = prop.type.compare_sets(left_properties.get(name),
                                            right_properties.get(name))
        score += prop_score * weight
    # NOTE(review): the 0.9 factor caps the result below 1.0; presumably
    # so that a computed match never reads as full certainty - confirm.
    return max(0.0, min(1.0, score)) * 0.9
50+
51+
52+
def compare_fingerprints(left, right):
    """Return the weighted score of the best-matching fingerprint pair.

    Every fingerprint of ``left`` is paired with every fingerprint of
    ``right``. Each pair is scored as its Jaro similarity multiplied by
    ``dampen(3, 20, <shorter fingerprint>)``; the highest pair score,
    multiplied by ``FP_WEIGHT``, is returned. With no pairs the result
    is 0.
    """
    left_prints = ensure_list(left.get('fingerprints'))
    right_prints = ensure_list(right.get('fingerprints'))
    best = 0
    for left_fp, right_fp in itertools.product(left_prints, right_prints):
        similarity = jaro(left_fp, right_fp)
        # presumably dampen() scales down scores for short strings -
        # verify against followthemoney.util.dampen
        shorter = min(left_fp, right_fp, key=len)
        pair_score = similarity * dampen(3, 20, shorter)
        best = max(best, pair_score)
    return best * FP_WEIGHT

aleph/logic/xref.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
from aleph.index.core import entities_index
88
from aleph.index.xref import entity_query
99
from aleph.index.entities import iter_entities
10-
from aleph.index.util import search_safe
10+
from aleph.index.util import search_safe, unpack_result
11+
from aleph.logic.compare import compare
1112

1213
log = logging.getLogger(__name__)
14+
EXCLUDES = ['text', 'roles']
1315

1416

15-
def _xref_item(item, collection_id=None):
17+
def xref_item(item, collection_id=None):
1618
"""Cross-reference an entity or document, given as an indexed document."""
1719
name = item.get('name') or item.get('title')
1820
query = entity_query(item, collection_id=collection_id)
@@ -21,8 +23,8 @@ def _xref_item(item, collection_id=None):
2123

2224
query = {
2325
'query': query,
24-
'size': 10,
25-
'_source': ['collection_id', 'name'],
26+
'size': 15,
27+
'_source': {'excludes': EXCLUDES}
2628
}
2729
result = search_safe(index=entities_index(), body=query)
2830
results = result.get('hits').get('hits')
@@ -40,16 +42,16 @@ def _xref_item(item, collection_id=None):
4042
dq.delete()
4143

4244
for result in results:
43-
source = result.get('_source', {})
44-
log.info("Xref [%.1f]: %s <=> %s", result.get('_score'),
45-
name, source.get('name'))
45+
result = unpack_result(result)
46+
score = compare(item, result)
47+
log.info("Xref [%.1f]: %s <=> %s", score, name, result.get('name'))
4648
obj = Match()
4749
obj.entity_id = entity_id
4850
obj.document_id = document_id
4951
obj.collection_id = item.get('collection_id')
50-
obj.match_id = result.get('_id')
51-
obj.match_collection_id = source.get('collection_id')
52-
obj.score = result.get('_score')
52+
obj.match_id = result.get('id')
53+
obj.match_collection_id = result.get('collection_id')
54+
obj.score = score
5355
db.session.add(obj)
5456
db.session.commit()
5557

@@ -60,6 +62,6 @@ def xref_collection(collection_id, other_id=None):
6062
matchable = [s.name for s in model if s.matchable]
6163
entities = iter_entities(collection_id=collection_id,
6264
schemata=matchable,
63-
excludes=['text', 'roles', 'properties.*'])
65+
excludes=EXCLUDES)
6466
for entity in entities:
65-
_xref_item(entity, collection_id=other_id)
67+
xref_item(entity, collection_id=other_id)

platform/base/Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,3 @@ RUN python3 -m spacy download xx
100100
# RUN python3 -m spacy download es && python3 -m spacy download pt
101101
# RUN python3 -m spacy download fr && python3 -m spacy download it
102102
# RUN python3 -m spacy download nl
103-

requirements-generic.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,4 +65,4 @@ python-magic==0.4.15
6565
odfpy==1.3.6
6666
olefile==0.45.1
6767
-e git://github.com/alephdata/flanker.git@65eb960f1eaa1fe2925b03f3740e2eff3c21dafc#egg=flanker
68-
ply==3.10
68+
ply==3.10

ui/src/screens/CollectionXrefMatchesScreen/CollectionXrefMatchesScreen.jsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ class CollectionXrefMatchesScreen extends Component {
146146
{ matches.total !== undefined && matches.results.map((match) => (
147147
<tr key={match.id}>
148148
<td className="numeric narrow">
149-
<FormattedNumber value={parseInt(match.score, 10)} />
149+
<FormattedNumber value={parseInt(parseFloat(match.score) * 100, 10)} />
150150
</td>
151151
{match.entity && (
152152
<React.Fragment>

0 commit comments

Comments
 (0)