update crawler
hiseen committed Feb 6, 2019
1 parent 4c74c27 commit 724013d
Showing 2 changed files with 46 additions and 3 deletions.
14 changes: 11 additions & 3 deletions webCrawler/crawlerForEatingWell.py
@@ -10,9 +10,9 @@
 lunch_file="D:\\lunch.json"
 breakfast_file="D:\\breakfast.json"
 other_file="D:\\other.json"
-cache_filename="D:\\cache.tmp"
 max_try=5
 img_src_full_star="https://images.media-allrecipes.com/EatingWellAssets/assets/svg/icon/recipe-rating-stars/eating-well-star-full.svg"
+img_null="http://images.media-allrecipes.com/images/82126.jpg"
 
 def getDataFromPage(page,dinner,lunch,breakfast,other):
     p=requests.get(page)
@@ -52,7 +52,15 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
         author=temp.text.strip()
     except:
         print("no author! url = ",page)
-        author=None
+        author="EatingWell Kitchen"
+    temp=soup.find("div",{"class":"recipeDetailSummaryImageContainer"})
+    try:
+        img_url=temp.find("a")['href']
+        if img_url==img_null:
+            img_url=None
+    except:
+        print("image not found! url = ",page)
+        img_url=None
     tags=[i.text.strip() for i in soup.find_all("span",{"class":"nutritionTag"})]
     ingredients=[i.text.strip() for i in soup.find_all("span",{"itemprop":"ingredients"})]
     try:
@@ -102,7 +110,7 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
         avg_rating/=len(comments)
     json_obj={"index":index,"title":title,"avg_rating":avg_rating,"comments":comments,\
         "related":related,"tips":tips,"cals":cals,"total_time":total_time,"prep_time":prep_time,\
-        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions}
+        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions, "image_url":img_url}
     small_index="".join(i.lower() for i in index)
     if "dinner" in small_index:
         dinner.append(json_obj)
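For context, here is the image-extraction logic this commit adds, pulled out as a standalone sketch. fetch_image_url is a hypothetical helper name introduced only for illustration; img_null is the stock "no photo" asset the site serves, taken from the diff above.

import requests
from bs4 import BeautifulSoup

# Placeholder asset the site returns when a recipe has no photo (from the diff).
img_null="http://images.media-allrecipes.com/images/82126.jpg"

def fetch_image_url(page):
    """Return the recipe image URL for page, or None if missing or a placeholder."""
    soup=BeautifulSoup(requests.get(page).text,"html.parser")
    container=soup.find("div",{"class":"recipeDetailSummaryImageContainer"})
    try:
        img_url=container.find("a")['href']
        # Treat the site's stock "no image" asset the same as a missing image.
        if img_url==img_null:
            img_url=None
    except (AttributeError,TypeError,KeyError):
        print("image not found! url = ",page)
        img_url=None
    return img_url

Catching AttributeError/TypeError/KeyError here keeps genuine network or parsing bugs visible; the commit itself uses a bare except, matching the style of the surrounding crawler code.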
35 changes: 35 additions & 0 deletions webCrawler/utils.py
@@ -0,0 +1,35 @@
+import json
+dinner_file="D:\\dinner.json"
+lunch_file="D:\\lunch.json"
+breakfast_file="D:\\breakfast.json"
+other_file="D:\\other.json"
+dinner=json.load(open(dinner_file))
+breakfast=json.load(open(breakfast_file))
+lunch=json.load(open(lunch_file))
+other=json.load(open(other_file))
+file_list=[breakfast,dinner,lunch,other]
+
+def countTag(files):
+    tags=set()
+    for i in files:
+        for j in i:
+            for k in j['tags']:
+                tags.add(k)
+            for k in j['index']:
+                tags.add(k)
+    return tags
+
+def saveFiles():
+    json.dump(dinner,open(dinner_file,'w'))
+    json.dump(breakfast,open(breakfast_file,'w'))
+    json.dump(lunch,open(lunch_file,'w'))
+    json.dump(other,open(other_file,'w'))
+
+def updateSource(files):
+    count=0
+    for i in files:
+        for j in i:
+            if j['source']==None:
+                j['source']="EatingWell Kitchen"
+                count+=1
+    print("found {} dishes without source".format(count))
