forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAddress.cpp
20874 lines (19290 loc) · 606 KB
/
Address.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//-*- coding: utf-8 -*-
#include "Proxy.h"
class Address *g_address; // for debug
#define CRID_ANY 0
#define CRID_US 226
//
// if you have "in <city/adm1 name>" in same sentence as street then
// require that that item be a city/adm1 in any address you try to do.
// i would set "long long inPrepPhrase" to be the city/adm1 place hash.
// so if it is not zero, check for it. but add it with addProperPlaces()
// first to see if it added anything!! then we can
//
//and fix it so "1914" years and older years are pub dates!
//and include days of the week in pub dates like "sunday, april 11, 2004"
//too!!
//do not allow lower case 'or' in place name!
//do not allow place names starting with "arrangements by" or "sponsored by"
// test on http://alibi.com/index.php?scn=cal
// test on http://www.burtstikilounge.com/burts/
// TODO: FOR ADDRESS overlap detection, just hash every word index for
// every Place which can not be shared. then store the score and
// Address ptr as the data value, so we can do a quick compare!
// TODO: also add conflicting addresses with the same score as winners.
// if we can't resolve a winner then we should just eliminate both/all
// to be on the safe side. like the alibi.com page has both albuquerque
// and santa fe in the <title> tag so it is really just lucky that we
// pick albuquerque most of the time... we might be able to bring in
// street name to city map to help us fix this one. if both cities have
// the same street name, then nuke both! any other ideas?
// TODO: for the abqjournal.com page we need to determine the most popular
// city/adm1 pair over the whole page and use that as another default
// option. also consider if we should have several and score them...
// TODO: for all the phrases in "small" sections and all phrases following
// "at" or "at the" look those phrases up in placedb as place names
// to get their addresses. also confirm the place names we extract
// that are immediately before street names. also get all the possible
// city/adm1/ctry tuples that each place name might have. if these
// are not right next to it then i guess you need to get them from
// the title and tagdb. that way the placedb lookup can integrate
// the tuples into the key and greatly narrow the list. we may have
// to then do multiple lookups for the same place name in placedb,
// so another reason we should distribute them and keep them in memory
// or at least on an SSD. use *namedb* to index place names just like
// indexdb. then we can conduct a search for a place name on namedb
// and get the corresponding keys of the place records in placedb.
// namedb will need to be mostly in memory then!
// TODO: verify street addresses we do extract by looking up each one in
// placedb by the street. each street may have multiple city/adm1/ctry
// tuples, so this lookup should narrow it down!
// test zipcode hyphen fix on abqjournal.com/contact.html
#include "gb-include.h"
#include "Address.h"
#include "Sections.h"
//#include "DateParse2.h"
#include "Abbreviations.h"
#include "Phrases.h"
//#include "Weights.h"
#include "XmlDoc.h" // hashWords()
#include "Hostdb.h"
#include "Placedb.h"
#include "sort.h"
#include "HttpServer.h"
//#define CF_UNIQUE (((unsigned long long)1LL)<<63)
bool getBestLatLon ( RdbList *list ,
double *bestLat ,
double *bestLon ,
long *numVotes ,
long niceness ,
long winnerSnh ) ;
char *getLatLonPtrFromStr ( char *data ) ;
void getLatLonFromStr ( char *data , double *lat , double *lon);
char *getStateAbbr ( uint64_t bit ) ;
long long getWordXorHash ( char *s ) ;
long long getWordXorHash2 ( char *s ) ;
long getStateOffset ( long long *h ) ;
class StateDesc *getStateDescFromBits ( uint64_t bit ) ;
// returns 0 if not a state:
uint64_t getStateBitFromHash ( long long *h ) ;
static bool setHashes ( class Place *p , Words *ww , long niceness ) ;
static bool addIndicator ( char *s , char bit , float boost );
static bool addIndicator ( long long h , char bit , float boost );
//static void printAddress ( class Address *A , class SafeBuf *pbuf , long i);
static void printPlaces ( PlaceMem *pm , SafeBuf *pbuf ,
class Sections *sections ,
class Address *base ) ;
static bool getZipLatLon ( char *zip ,
long zipLen ,
float *zipLat ,
float *zipLon ) ;
//
// new stuff
//
static bool generatePlacesFile ( ) ;
static bool loadPlaces ( ) ;
class PlaceDesc *getState_new ( uint64_t pd64 , uint8_t crid , long niceness );
PlaceDesc *getState2_new ( char *state , uint8_t crid , long niceness ) ;
class PlaceDesc *getCity_new ( uint64_t ch64 ,
char *stateAbbr ,
uint8_t crid ,
long niceness ) ;
class PlaceDesc *getCity2_new ( char *city ,
char *stateAbbr ,
uint8_t crid ,
long niceness ) ;
PlaceDesc *getCity3_new ( uint64_t ch64 ,
uint64_t stateHash64,
uint8_t crid ,
long niceness ) ;
bool getLongestPlaceName_new ( long i,
long alnumPos,
Words *w,
// must match! PDF_CITY|STATE|COUNTRY
uint8_t placeType,
uint8_t crid, // can be CRID_ANY
char *stateAbbr, // can be NULL
uint64_t *placeHash64,
long *placeAlnumA,
long *placeAlnumB,
long *placeA,
long *placeB ,
// set to most popular match
PlaceDesc **pdp ) ;
bool getZip_new ( long a ,
long alnumPos ,
Words *words ,
uint64_t *zipHash64 ,
uint64_t *zipCityHash64 ,
uint64_t *zipStateHash64 ,
long *zipAlnumA,
long *zipAlnumB,
long *zipA,
long *zipB ,
float *zipLat,
float *zipLon) ;
PlaceDesc *getMostPopularPlace_new ( long long cityHash64,
uint8_t crid ,
uint8_t placeType,
long niceness );
// Global buffer that PlaceDesc name offsets index into; see
// PlaceDesc::getOfficialName(), which returns g_pbuf + m_officialNameOffset.
// NOTE(review): presumably filled by loadPlaces() -- confirm.
char *g_pbuf = NULL;
// size of g_pbuf (units presumably bytes -- TODO confirm)
long g_pbufSize = 0;
// NOTE(review): presumably maps place-name hashes to PlaceDesc records
// stored in g_pbuf -- confirm against loadPlaces()/getPlaceDesc().
HashTableX g_nameTable;
// Returns this place's official name: a pointer into the global g_pbuf
// buffer, m_officialNameOffset bytes from its start.
char *PlaceDesc::getOfficialName ( ) {
	char *officialName = &g_pbuf[m_officialNameOffset];
	return officialName;
}
char *PlaceDesc::getStateName ( ) {
// get our state abbr
char buf[3];
buf[0] = m_adm1[0];
buf[1] = m_adm1[1];
buf[2] = '\0';
// does this convert to lowercase? yes... it should
uint64_t placeHash64 = getWordXorHash ( buf );
// look up the place desc for the state
PlaceDesc *sd = getPlaceDesc ( placeHash64 ,
PDF_STATE,
m_crid,
buf, // state abbr
0 ); // niceness
if ( ! sd ) return NULL;
return sd->getOfficialName();
}
const char *PlaceDesc::getCountryName ( ) {
return g_countryCode.getName ( m_crid );
}
// Lookup tables used by the address parser. g_indicators is the only one
// with external linkage; the rest are private to this file.
// NOTE(review): g_indicators is presumably populated via addIndicator()
// -- confirm.
HashTableX g_indicators;
static HashTableX g_timeZones;
// each slot points at a CityDesc stored in g_cityBuf (see CityDesc below)
static HashTableX g_cities;
static HashTableX g_states;
static HashTableX g_aliases;
// stocked with ZipDesc records (see ZipDesc below)
static HashTableX g_zips;
// backing store for the CityDesc records; ZipDesc::m_cityOffset is an
// offset into this buffer
char *g_cityBuf = NULL;
long g_cityBufSize = 0;
// . NOW each slot in the g_cities has a ptr to a CityDesc in SafeBuf g_cityBuf
// . so now we can put all the alternate names and aliases into the same table
// Variable-length record describing one city name. Instances live in
// g_cityBuf and are pointed to by slots of g_cities. Because m_data is a
// flexible array member, a CityDesc can not be copied or allocated by
// sizeof(CityDesc) alone.
class CityDesc {
public:
// set bit for each state that the city is in
uint64_t m_adm1Bits;
// index into s_states[] of the city's most popular state.
// for chicago, we would pick "13" since s_states[13] is illinois
char m_mostPopularState;
// trailing variable-length text describing the city's locations and
// alternate names, in this format:
// "us.nm,us.ny,es.a1,...|en-nl-fi=cincinnati,es-de=cincinnatus,..."
char m_data[];
};
//bool setFromStr(Address *a,char *s,pbits_t flags ,
// Place *places , long *np , long maxPlaces, long niceness );
static uint64_t getAddressHash ( Place *street ,
Place *city ,
Place *adm1 ,
Place *zip ) ;
static void verifiedWrapper ( void *state ) ;
static void gotMsg2cReplyWrapper ( void *state , void *state2 ) ;
static void gotList2c ( void *state , RdbList *xxx , Msg5 *yyy ) ;
static void sendBackAddress ( class State2c *st ) ;
Place *g_pa = NULL;
#define MIN_POP_COUNT 500
//#define MAX_STREETS 300
//#define MAX_PLACES 3500
// i raised from 15 to 25 since "Virginia Beach" city was not being picked up
// on socialmediabeach.com
#define MAX_CITIES 25
#define MAX_ADM1 80 // 1500
#define MAX_ZIPS 5
// stock g_zips with these zip code descriptors
// One zip code descriptor; g_zips is stocked with these.
class ZipDesc {
 public:
	// . two-char adm1 (state/province) code
	// . this is unique within the country code only
	// . see /gb/geo/geonames/admin1Codes.txt for the list
	// . remove the "CC." country code prefixing each
	// . example from that file: "NL.09 Utrecht\n"
	char m_adm1[2];

	// a single byte country id (converted to from a 2 char country id)
	// used to live here but is disabled:
	//uint8_t m_crid;

	// hash of the city it is in
	long long m_cityHash;

	// offset into g_cityBuf of the city name
	long m_cityOffset;

	// now we use the adm1 bits since US-only now
	uint64_t m_adm1Bits;

	// lat/lon of centroid. for sorting by dist when user's zip is known
	float m_latitude;
	float m_longitude;

	// clears the state identification (adm1 code and adm1 bits) only.
	// the city hash/offset and lat/lon are left as they are.
	void reset ( ) {
		m_adm1[0]  = 0;
		m_adm1[1]  = 0;
		m_adm1Bits = 0;
	}
};
// Day-of-week words, singular forms first (sunday..saturday) followed by
// the plural forms in the same order. getDayOfWeek() loads this array into
// s_doyTable with initWordTable() and reduces the looked-up score with
// (score-1) % 7.
// NOTE(review): presumably the score is the 1-based position in this array,
// so "sunday" and "sundays" both map to 0 -- do not reorder without
// confirming against initWordTable().
static char *s_days[] = {
"sunday",
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sundays",
"mondays",
"tuesdays",
"wednesdays",
"thursdays",
"fridays",
"saturdays"
};
// Table of the 50 US states plus DC, one StateDesc per row. Each row holds
// three strings, all lower case: the two-letter abbreviation, the full name,
// and an alternate abbreviation (NULL where the table has none).
// NOTE(review): position in this array is significant -- the comment on
// CityDesc::m_mostPopularState refers to s_states[13] being illinois, and the
// uint64_t "adm1 bits" presumably use one bit per row. Do not reorder or
// insert rows without checking those users.
static StateDesc s_states[] = {
{"al","alabama","ala"},
{"ak","alaska","alas"},
{"az","arizona","ariz"},
{"ar","arkansas","ark"},
{"ca","california","calif"},
{"co","colorado","colo"},
{"ct","connecticut","conn"},
{"de","delaware","del"},
{"dc","district of columbia","d.c."},
{"fl","florida","fla"},
{"ga","georgia",NULL},
{"hi","hawaii","h.i."},
{"id","idaho","ida"},
{"il","illinois","ill"},
{"in","indiana","ind"},
{"ia","iowa",NULL},
{"ks","kansas","kan"},
{"ky","kentucky",NULL},
{"la","louisiana",NULL},
{"me","maine",NULL},
{"md","maryland",NULL},
{"ma","massachusetts","mass"},
{"mi","michigan","mich"},
{"mn","minnesota","minn"},
{"ms","mississippi","miss"},
{"mo","missouri",NULL},
{"mt","montana","mont"},
{"ne","nebraska","nebr"},
{"nv","nevada","nev"},
{"nh","new hampshire","n.h."},
{"nj","new jersey","n.j."},
{"nm","new mexico","n.m."},
{"ny","new york","n.y."},
{"nc","north carolina","n.c."},
{"nd","north dakota","n.d."},
{"oh","ohio",NULL},
{"ok","oklahoma","okla"},
{"or","oregon","ore"},
{"pa","pennsylvania","penn"},
{"ri","rhode island","r.i."},
{"sc","south carolina","s.c."},
{"sd","south dakota","s.d."},
{"tn","tennessee","tenn"},
{"tx","texas","tex"},
{"ut","utah",NULL},
{"vt","vermont",NULL},
{"va","virginia","virg"},
{"wa","washington","wash"},
{"wv","west virginia","w.v."},
{"wi","wisconsin","wis"},
{"wy","wyoming","wyo"}
};
#include "StopWords.h"
// day-of-week word table, built from s_days[] on the first call to
// getDayOfWeek()
static HashTableX s_doyTable;
// true once s_doyTable has been successfully initialized
static bool s_doyInit = false;
// . maps a word hash to a day-of-week number
// . "h" is looked up in s_doyTable, which is built from s_days[] the first
//   time we are called
// . returns (score-1) % 7 where "score" is what the table holds for "h"
// . returns -1 if the table could not be initialized
// . NOTE(review): presumably a miss yields score 0 and thus also -1, since
//   (0-1) % 7 is -1 in C++ -- confirm HashTableX::getScore() on a miss
long getDayOfWeek ( long long h ) {
	// build the table on first use; we retry on the next call if the
	// build fails
	if ( ! s_doyInit ) {
		s_doyInit = initWordTable ( &s_doyTable   ,
					    s_days        ,
					    sizeof(s_days),
					    "doytbl"      );
		if ( ! s_doyInit ) return -1;
	}
	// the original author guessed that sunday scores 1
	long tableScore = s_doyTable.getScore ( &h );
	// fold both the singular and plural halves of s_days[] into 0-6
	return ( tableScore - 1 ) % 7;
}
// http://www.dailylobo.com/calendar/
// http://www.abqthemag.com/events.html
// http://www.abqjournal.com/calendar/default.php
// http://www.abqjournal.com/calendar/month.htm (243k! do not truncate!!)
// http://www.kasa.com/subindex/entertainment/events_calendar
// http://www.trumba.com/calendars/KRQE_Calendar.rss (rss)
// http://www.koat.com/calendar/index.html
// http://www.trumba.com/calendars/albuquerque-area-events-calendar.rss.
// http://www.google.com/calendar/embed?mode=AGENDA&height=700&wkst=1&bgcolor=%23FFFFFF&src=vn90mq4n30kodohqjv8cdn5cfg%40group.calendar.google.com&color=%237A367A
// http://www.krqe.com/subindex/features/events_calendar
// http://www.alibi.com/index.php?scn=cal
// http://www.publicbroadcasting.net/kunm/events.eventsmain
// http://www.publicbroadcasting.net/kunm/events.eventsmain?action=showCategoryListing&newSearch=true&categorySearch=4025
// http://www.770kob.com/article.asp?id=521586
// http://events.kgoradio.com/
// http://www.livenation.com/venue/journal-pavilion-tickets (journal pavilion)
// http://www.livenation.com/venue/kiva-auditorium-tickets
// http://events.kqed.org/events/
// http://www.sfbg.com/entry.php?entry_id=8401&catid=85&l=1
// http://events.sfgate.com/ (zvents.com)
// http://events.sfgate.com/search?cat=1
// http://entertainment.signonsandiego.com/search/?type=event
// http://www.sdcitybeat.com/cms/event/search/?menu=Events
// ** http://www.sandiegometro.com/calendar/arts.php
// address parsing test cases:
// http://yellowpages.superpages.com/listings.jsp?CS=L&MCBP=true&search=Find+It&SRC=&C=bicycles&STYPE=S&L=Albuquerque+NM+&x=0&y=0
// address examples:
// BRAZIL:
// Marina Costa e Silva
// Rua Afonso Canargo, 805
// Santana
// 85070-200 Guarapuava - PR
// University of New Mexico
// Department of Physics and Astronomy
// MSC07 4220
// 800 Yale Blvd NE
// Albuquerque, New Mexico 87131-0001 USA
// US-380
// Lincoln, NM
// Saturday, August 8, 2009
static bool s_init = false;
// Constructs an empty Addresses object that owns no buffers. Every scalar
// member that reset() reads or assigns is given a defined value here, so the
// object is in a known state even before the first call to set() (which
// calls reset() itself).
Addresses::Addresses ( ) {
	m_buf            = NULL;
	m_bufSize        = 0;
	m_calledGeocoder = false;
	m_xd             = NULL;
	m_msg2c          = NULL;
	m_sorted         = NULL;
	// reset() only reads this when m_sorted is non-NULL, but do not
	// leave it indeterminate
	m_sortedSize     = 0;
	m_sortedValid    = false;
	m_breached       = false;
	m_numValid       = 0;
	// these were previously assigned only in reset(), leaving them
	// indeterminate between construction and the first set()/reset().
	// use the same values reset() assigns.
	m_firstBreach        = true;
	m_uniqueStreetHashes = 0;
}
// Destructor: reset() releases every buffer and helper object we own.
Addresses::~Addresses ( ) {
	this->reset();
}
// . frees everything this object owns and returns it to its empty state
// . called by the destructor and at the top of set()
// . safe to call repeatedly: every freed pointer is NULLed afterwards
void Addresses::reset ( ) {
	// release the main data buffer
	if ( m_buf && m_bufSize ) {
		mfree ( m_buf , m_bufSize , "adata");
	}
	m_buf     = NULL;
	m_bufSize = 0;

	// empty the SafeBuf
	m_sb.purge();

	// back to the "nothing parsed yet" state
	m_calledGeocoder = false;
	m_numValid       = 0;
	m_breached       = false;
	m_firstBreach    = true;

	// tear down the Msg2c helper if we made one. mdelete() is called
	// before the actual delete, as in the rest of the code base.
	if ( m_msg2c != NULL ) {
		mdelete ( m_msg2c , sizeof(Msg2c),"aamsg2c");
		delete (m_msg2c);
		m_msg2c = NULL;
	}

	// release the sorted buffer
	if ( m_sorted != NULL ) {
		mfree ( m_sorted , m_sortedSize , "asortbuf");
	}
	m_sorted      = NULL;
	m_sortedValid = false;

	m_uniqueStreetHashes = 0;
}
// Cached 64-bit hashes of common words (h_<word> is the hash of "<word>").
// They are zero until the first call to Addresses::set(), which fills them
// in once (guarded by its local static s_setHashes flag) using hash64n() /
// hash64(). Do not rely on them before an Addresses object has been set.
static long long h_court;
static long long h_i;
static long long h_interstate;
static long long h_page ;
static long long h_corner ;
static long long h_between ;
static long long h_btwn ;
static long long h_bet ;
static long long h_streets ;
static long long h_sts ;
static long long h_at ;
static long long h_come ;
static long long h_is ;
static long long h_located ;
static long long h_intersection;
static long long h_law ;
static long long h_address ;
static long long h_added ;
static long long h_copy ;
static long long h_search ;
static long long h_find ;
static long long h_go ;
static long long h_town ;
static long long h_city ;
static long long h_street ;
static long long h_telephone;
static long long h_tel ;
static long long h_ph ;
static long long h_fax ;
static long long h_where ;
static long long h_location;
static long long h_venue ;
static long long h_map ;
static long long h_office ;
static long long h_center ;
static long long h_mailing ;
static long long h_mail ;
static long long h_snail ;
static long long h_edit ;
static long long h_email ;
static long long h_phone ;
static long long h_inc ;
static long long h_llc ;
static long long h_review ;
static long long h_reviews ;
static long long h_write ;
static long long h_add ;
static long long h_view ;
static long long h_favorites ;
static long long h_more ;
static long long h_info ;
static long long h_information ;
static long long h_the ;
static long long h_in ;
static long long h_a ;
static long long h_paseo ;
static long long h_de ;
static long long h_del ;
static long long h_all ;
static long long h_rights ;
static long long h_reserved ;
static long long h_contact ;
static long long h_us ;
static long long h_by ;
static long long h_of ;
static long long h_for ;
static long long h_arrangements ;
static long long h_arranged ;
static long long h_sponsored ;
static long long h_to ;
static long long h_every ;
static long long h_p ;
static long long h_b ;
static long long h_hwy ;
static long long h_state ;
static long long h_county ;
static long long h_cnty ;
static long long h_cty ;
static long long h_road ;
static long long h_route ;
static long long h_rte ;
static long long h_rt ;
static long long h_highway ;
static long long h_hiway ;
static long long h_cr ;
static long long h_o ;
static long long h_po ;
static long long h_post ;
static long long h_box ;
static long long h_top ;
static long long h_one ;
static long long h_noon ;
static long long h_midnight ;
static long long h_daily ;
static long long h_st ;
static long long h_nd ;
static long long h_rd ;
static long long h_th ;
static long long h_away ;
static long long h_results ;
static long long h_days ;
static long long h_blocks ;
static long long h_block ;
static long long h_miles ;
static long long h_mile ;
static long long h_year ;
static long long h_years ;
static long long h_yr ;
static long long h_yrs ;
static long long h_hours ;
static long long h_hrs ;
static long long h_hour ;
static long long h_hr ;
static long long h_mi ;
static long long h_kilometers;
static long long h_km ;
static long long h_copyright ;
static long long h_and ;
static long long h_or ;
static long long h_suite ;
static long long h_ste ;
static long long h_bldg ;
static long long h_bld ;
static long long h_building ;
static long long h_unit ;
static long long h_room ;
static long long h_pier ;
static long long h_rm ;
static long long h_run ;
static long long h_ne ;
static long long h_nw ;
static long long h_se ;
static long long h_sw ;
static long long h_n ;
static long long h_s ;
static long long h_e ;
static long long h_w ;
static long long h_north;
static long long h_northeast;
static long long h_northwest;
static long long h_east;
static long long h_west;
static long long h_south;
static long long h_southeast;
static long long h_southwest;
static long long h_heart ;
static long long h_core ;
static long long h_least ;
static long long h_most ;
static long long h_this ;
static long long h_appeared ;
static long long h_role ;
static long long h_studied;
static long long h_prize;
static long long h_finish;
static long long h_door;
static long long h_entrance;
static long long h_area;
static long long h_left ;
static long long h_right ;
static long long h_stare ;
static long long h_sea ;
static long long h_discount ;
static long long h_discounted ;
static long long h_www;
static long long h_gaze ;
static long long h_look ;
static long long h_looking;
static long long h_be ;
static long long h_determined ;
static long long h_call ;
static long long h_details;
static long long h_tba;
static long long h_avenue;
static long long h_ave;
static long long h_register;
static long long h_sign;
static long long h_up;
static long long h_signup;
static long long h_tickets;
static long long h_purchase;
static long long h_get;
static long long h_enroll;
static long long h_buy;
static long long h_presale ;
static long long h_pre ;
static long long h_sale ;
static long long h_on ;
static long long h_sales ;
static long long h_end ;
static long long h_begin ;
static long long h_start ;
static long long h_am;
static long long h_fm;
// . first identifies all the "Places" using the rules above
// . then clusters the "Places" together into an "Address"
// . we use the address at the top of the page, and the site contact info,
// etc. to be defaults, so we can inherit, city, state, etc. from those
// . returns false if blocked, true otherwise. sets g_errno on error.
bool Addresses::set ( Sections *sections ,
Words *words ,
Bits *bits ,
TagRec *gr ,
Url *url ,
long long docId ,
char *coll ,
long domHash32 ,
long ip ,
//long tagPairHash ,
long niceness ,
SafeBuf *pbuf ,
void *state ,
void (*callback) (void *state) ,
uint8_t contentType ,
// from XmlDoc::ptr_addressReply in a title rec
//char *addressReply ,
//long addressReplySize ,
//bool addressReplyValid ,
char *siteTitleBuf ,
long siteTitleBufSize ,
XmlDoc *xd ) {
reset();
// save stuff
m_xd = xd;
m_sections = sections;
m_words = words;
m_wptrs = words->m_words;
m_wlens = words->m_wordLens;
m_nw = words->m_numWords;
m_wids = words->getWordIds();
m_tids = words->getTagIds();
m_bits = bits;
m_gr = gr;
m_url = url;
m_docId = docId;
m_coll = coll;
m_domHash32 = domHash32;
m_ip = ip;
//m_tagPairHash = tagPairHash;
m_niceness = niceness;
m_pbuf = pbuf;
m_state = state;
m_callback = callback;
m_contentType = contentType;
//m_addressReply = addressReply;
//m_addressReplySize = addressReplySize;
//m_addressReplyValid = addressReplyValid;
m_siteTitleBuf = siteTitleBuf;
m_siteTitleBufSize = siteTitleBufSize;
m_sb.purge();
static bool s_setHashes = false;
if ( ! s_setHashes ) {
// flag it
s_setHashes = true;
// shortcuts
h_i = hash64n ("i");
h_court = hash64n ("court");
h_interstate = hash64n ("interstate");
h_page = hash64n ("page");
h_corner = hash64n ("corner");
h_between = hash64n ( "between");
h_btwn = hash64n ( "btwn");
h_bet = hash64n ( "bet");
h_streets = hash64n ( "streets");
h_sts = hash64n ( "sts");
h_at = hash64n ( "at" );
h_come = hash64n ("come");
h_is = hash64n ( "is" );
h_located = hash64n ( "located" );
h_intersection = hash64n("intersection");
h_law = hash64 ( "law" ,3);
h_address = hash64 ( "address",7);
h_added = hash64 ( "added",5);
h_copy = hash64 ( "copy",4);
h_search = hash64 ( "search",6);
h_find = hash64 ( "find",4);
h_go = hash64 ( "go",2);
h_town = hash64n ( "town");
h_city = hash64n ( "city");
h_street = hash64 ( "street",6);
h_telephone = hash64 ( "telephone",9);
h_tel = hash64 ( "tel",3);
h_ph = hash64 ( "ph",2);
h_fax = hash64 ( "fax",3);
h_where = hash64 ( "where",5);
h_location= hash64 ( "location",8);
h_venue = hash64n("venue");
h_map = hash64 ( "map" ,3);
h_office = hash64 ( "office" ,6);
h_center = hash64n ("center");
h_mailing = hash64 ( "mailing" ,7);
h_mail = hash64 ( "mail" ,4);
h_snail = hash64 ( "snail" ,5);
h_edit = hash64 ( "edit" ,4);
h_email = hash64 ( "email" ,5);
h_phone = hash64 ( "phone" ,5);
h_inc = hash64 ( "inc" ,3);
h_llc = hash64 ( "llc" ,3);
h_review = hash64 ( "review" ,6);
h_reviews = hash64 ( "reviews" ,7);
h_write = hash64 ( "write", 5);
h_add = hash64 ( "add",3 );
h_view = hash64 ( "view", 4);
h_favorites = hash64 ( "favorites", 9);
h_more = hash64 ( "more",4 );
h_info = hash64 ( "info",4 );
h_information = hash64 ( "information", 11);
h_the = hash64 ( "the" ,3);
h_in = hash64 ( "in" ,2);
h_a = hash64 ( "a" ,1);
h_paseo = hash64n ( "paseo");
h_de = hash64n ( "de");
h_del = hash64n ( "del");
h_all = hash64 ( "all" ,3);
h_rights = hash64 ( "rights" ,6);
h_reserved = hash64 ( "reserved" ,8);
h_contact = hash64 ( "contact" , 7);
h_us = hash64 ( "us" , 2);
h_by = hash64 ( "by" ,2);
h_of = hash64 ( "of" ,2);
h_for = hash64 ( "for" ,3);
h_arrangements = hash64("arrangements",12);
h_arranged = hash64("arranged",8);
h_sponsored = hash64("sponsored",9);
h_to = hash64 ( "to" ,2);
h_every = hash64 ( "every",5);
h_p = hash64 ( "p" ,1);
h_b = hash64n ( "b" );
h_hwy = hash64 ( "hwy" ,3);
h_state = hash64 ( "state" ,5);
h_county = hash64 ( "county" , 6 );
h_cnty = hash64 ( "cnty" , 4 );
h_cty = hash64 ( "cty" , 3 );
h_road = hash64 ( "road" ,4);
h_route = hash64 ( "route" ,5);
h_rte = hash64 ( "rte" ,3);
h_rt = hash64 ( "rt" ,2);
h_highway = hash64 ( "highway" ,7);
h_hiway = hash64 ( "hiway" ,5);
h_cr = hash64 ( "cr" ,2);
h_o = hash64 ( "o" ,1);
h_po = hash64 ( "po" ,2);
h_post = hash64 ( "post" ,4);
h_box = hash64 ( "box" ,3);
h_top = hash64n ( "top" );
h_one = hash64 ( "one" ,3);
h_noon = hash64n ( "noon" );
h_midnight = hash64n ( "midnight" );
h_daily = hash64n ( "daily" );
h_st = hash64 ( "st" ,2);
h_nd = hash64 ( "nd" ,2);
h_rd = hash64 ( "rd" ,2);
h_th = hash64 ( "th" ,2);
h_away = hash64 ( "away" ,4);
h_results = hash64 ( "results" , 7 );
h_days = hash64 ( "days", 4 );
h_blocks = hash64 ( "blocks",6);
h_block = hash64 ( "block",5);
h_miles = hash64 ( "miles",5);
h_mile = hash64n ( "mile");
h_year = hash64n("year");
h_years = hash64n("years");
h_yr = hash64n("yr");
h_yrs = hash64n("yrs");
h_hours = hash64 ( "hours",5);
h_hrs = hash64 ( "hrs",3);
h_hour = hash64n ( "hour");
h_hr = hash64n ( "hr");
h_mi = hash64 ( "mi",2);
h_kilometers= hash64 ( "kilometers",10);
h_km = hash64 ( "km",2);
h_copyright = hash64 ( "copyright",9);
h_and = hash64 ( "and" , 3 );
h_or = hash64 ( "or" , 2 );
h_suite = hash64 ( "suite",5);
h_ste = hash64 ( "ste",3);
h_bldg = hash64 ( "bldg",4);
h_bld = hash64n ( "bld");
h_building = hash64 ( "building",8);
h_unit = hash64 ( "unit",4);
h_room = hash64 ( "room",4);
h_pier = hash64 ( "pier",4);
h_rm = hash64 ( "rm",2);
h_run = hash64n ("run");
h_ne = hash64 ( "ne" ,2);
h_nw = hash64 ( "nw" ,2);
h_se = hash64 ( "se" ,2);
h_sw = hash64 ( "sw" ,2);
h_n = hash64 ( "n" ,1);
h_s = hash64 ( "s" ,1);
h_e = hash64 ( "e" ,1);
h_w = hash64 ( "w" ,1);
h_north = hash64n("north");
h_south = hash64n("south");
h_east = hash64n("east");
h_west = hash64n("west");
h_northeast = hash64n("northeast");
h_northwest = hash64n("northwest");
h_southeast = hash64n("southeast");
h_southwest = hash64n("southwest");
h_heart = hash64n ( "heart" );
h_core = hash64n ( "core" );
h_least = hash64n ( "least" );
h_most = hash64n ( "most" );
h_this = hash64n ( "this" );
h_north = hash64n ( "north" );
h_south = hash64n ( "south" );
h_east = hash64n ( "east" );
h_west = hash64n ( "west" );
h_appeared = hash64n ( "appeared" );
h_role = hash64n ( "role" );
h_studied = hash64n ( "studied" );
h_prize = hash64n ( "prize" );
h_finish = hash64n("finish");
h_door = hash64n("door");
h_entrance = hash64n("entrance");
h_area = hash64n("area");
h_left = hash64n ( "left" );
h_right = hash64n ( "right" );
h_stare = hash64n ( "stare" );
h_sea = hash64n ( "sea" );
h_discount = hash64n("discount");
h_discounted = hash64n("discounted");
h_www = hash64n("www");
h_gaze = hash64n ( "gaze" );
h_look = hash64n ( "look" );
h_looking = hash64n ( "looking" );
h_be = hash64n("be");
h_determined = hash64n("determined");
h_call = hash64n("call");
h_details = hash64n("details");
h_tba = hash64n("tba");
h_avenue = hash64n("avenue");
h_ave = hash64n("ave");
h_register = hash64n("register");
h_sign = hash64n("sign");
h_up = hash64n("up");
h_signup = hash64n("signup");
h_tickets = hash64n("tickets");
h_purchase = hash64n("purchase");
h_get = hash64n("get");
h_enroll = hash64n("enroll");
h_buy = hash64n("buy");
h_presale = hash64n("presale");
h_pre = hash64n("pre");
h_sale = hash64n("sale");
h_on = hash64n("on");
h_sales = hash64n("sales");
h_end = hash64n("end");
h_begin = hash64n("begin");
h_start = hash64n("start");
h_am = hash64n("am");
h_fm = hash64n("fm");
}
//m_msg2c.m_mcast.reset();
// sanity check -- did set2() corrupt our junk?
//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
// char *xx=NULL;*xx=0; }
// returns false and sets g_errno on error
bool status = set2 ( );
// sanity check -- did set2() corrupt our junk?
//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
// char *xx=NULL;*xx=0; }
// sanity check
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
// return true on error now
if ( ! status ) return true;
// . ok, go no further if from msg13
// . it will have to check m_good or something, not m_valid
if ( ! m_sections ) return true;
// if valid and empty, we are done
//if ( m_addressReplyValid && ! m_addressReply ) return true;
/*
-- mdw took this out because it had too many false positives. often
the place name 1 and/or 2 was wrong and was calling nonsense a
place! for many urls... and now that i removed the
SEC_CONTENDED_ADDRESS algo all the events on a page even if
different tag hashes, can share the same address. to replace
that algo i am ignore events with SEC_TITLE_OUTLINKED if the
event title is an outlink to another page, and also i am trying
to identify all place names in events. this outlinked bit should
fix the http://www.zvents.com/albuquerque-nm/events/show/88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer url, since it has a
little section that has "You may Also Like..." for events at
different venues, mentioned by name.
//
// . SELF-VERIFICATION LOOPS
//
// . now use the addresses that were inlined to verify those
// that were not inlined, assuming the place name matches
// . this will allow "The Filling Station" to be verified in
// http://www.zvents.com/albuquerque-nm/events/show/
// 88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer
// . first scan the addresses for inlined ones
// . logic taken basically from hashForPlacedb()
//
// init the table
HashTableX pt;
// returns true with g_errno set on error
if ( ! pt.set ( 8,4,256,NULL,0,false,m_niceness) ) return true;
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// get it
Address *a = &m_addresses[i];
// must be inlined
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
// sometimes a street can exist in two cities or states
if ( a->m_flags & AF_AMBIGUOUS ) continue;
// must not have a place name in place of the street name
if ( a->m_street.m_flags2 & PLF2_IS_NAME ) continue;
// hash into table only if valid
long long h1 = a->m_name1.m_hash;
// adjust it since setHashes() xors in 0x123456 for street
// names that are actually place names in disguise
h1 ^= 0x123456;
// incorporate the adm1 and city and ctry
h1 = hash64 ( a->m_city.m_hash , h1 );
h1 = hash64 ( a->m_adm1.m_hash , h1 );
h1 = hash64 ( a->m_ctry.m_hash , h1 );
// put it in
if ( a->m_name1.m_strlen && ! pt.addKey ( (char *)&h1, &a ) )
return true;
// same for second place name
long long h2 = a->m_name2.m_hash;
// adjust it since setHashes() xors in 0x123456 for street
// names that are actually place names in disguise
h2 ^= 0x123456;
// incorporate the adm1 and city and ctry
h2 = hash64 ( a->m_city.m_hash , h2 );
h2 = hash64 ( a->m_adm1.m_hash , h2 );
h2 = hash64 ( a->m_ctry.m_hash , h2 );
// hash into table only if valid
if ( a->m_name2.m_strlen && ! pt.addKey ( (char *)&h2, &a ) )
return true;
}
// now scan our addresses that have a place name in place of
// the street name and see if we can get a match
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// get it
Address *a = &m_addresses[i];
// we want a place name in place of the street name now
if ( ! ( a->m_street.m_flags2 & PLF2_IS_NAME ) ) continue;
// . USE the STREET here, not the name
// . it should already have had the 0x123456 xor'ed in
// in the logic below because PLF2_IS_NAME is set.
long long h1 = a->m_street.m_hash;