From 724013d87baf039626adc0ff6850b2ade9d559f6 Mon Sep 17 00:00:00 2001
From: Shengquan Ni
Date: Tue, 5 Feb 2019 21:01:51 -0800
Subject: [PATCH] update crawler

---
 webCrawler/crawlerForEatingWell.py | 14 +++++++++---
 webCrawler/utils.py                | 35 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100644 webCrawler/utils.py

diff --git a/webCrawler/crawlerForEatingWell.py b/webCrawler/crawlerForEatingWell.py
index 7176c38..0dc06e5 100644
--- a/webCrawler/crawlerForEatingWell.py
+++ b/webCrawler/crawlerForEatingWell.py
@@ -10,9 +10,9 @@
 lunch_file="D:\\lunch.json"
 breakfast_file="D:\\breakfast.json"
 other_file="D:\\other.json"
-cache_filename="D:\\cache.tmp"
 max_try=5
 img_src_full_star="https://images.media-allrecipes.com/EatingWellAssets/assets/svg/icon/recipe-rating-stars/eating-well-star-full.svg"
+img_null="http://images.media-allrecipes.com/images/82126.jpg"
 
 def getDataFromPage(page,dinner,lunch,breakfast,other):
     p=requests.get(page)
@@ -52,7 +52,15 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
         author=temp.text.strip()
     except:
         print("no author! url = ",page)
-        author=None
+        author="EatingWell Kitchen"
+    temp=soup.find("div",{"class":"recipeDetailSummaryImageContainer"})
+    try:
+        img_url=temp.find("a")['href']
+        if img_url==img_null:
+            img_url=None
+    except:
+        print("image not found! url = ",page)
+        img_url=None
     tags=[i.text.strip() for i in soup.find_all("span",{"class":"nutritionTag"})]
     ingredients=[i.text.strip() for i in soup.find_all("span",{"itemprop":"ingredients"})]
     try:
@@ -102,7 +110,7 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
     avg_rating/=len(comments)
     json_obj={"index":index,"title":title,"avg_rating":avg_rating,"comments":comments,\
         "related":related,"tips":tips,"cals":cals,"total_time":total_time,"prep_time":prep_time,\
-        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions}
+        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions, "image_url":img_url}
     small_index="".join(i.lower() for i in index)
     if "dinner" in small_index:
         dinner.append(json_obj)
diff --git a/webCrawler/utils.py b/webCrawler/utils.py
new file mode 100644
index 0000000..177d82f
--- /dev/null
+++ b/webCrawler/utils.py
@@ -0,0 +1,35 @@
+import json
+dinner_file="D:\\dinner.json"
+lunch_file="D:\\lunch.json"
+breakfast_file="D:\\breakfast.json"
+other_file="D:\\other.json"
+dinner=json.load(open(dinner_file))
+breakfast=json.load(open(breakfast_file))
+lunch=json.load(open(lunch_file))
+other=json.load(open(other_file))
+file_list=[breakfast,dinner,lunch,other]
+
+def countTag(files):
+    tags=set()
+    for i in files:
+        for j in i:
+            for k in j['tags']:
+                tags.add(k)
+            for k in j['index']:
+                tags.add(k)
+    return tags
+
+def saveFiles():
+    json.dump(dinner,open(dinner_file,'w'))
+    json.dump(breakfast,open(breakfast_file,'w'))
+    json.dump(lunch,open(lunch_file,'w'))
+    json.dump(other,open(other_file,'w'))
+
+def updateSource(files):
+    count=0
+    for i in files:
+        for j in i:
+            if j['source']==None:
+                j['source']="EatingWell Kitchen"
+                count+=1
+    print("found {} dishes without source".format(count))
\ No newline at end of file
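
Reviewer note on the new image hunk: it keeps an href only when the summary-image
container exists and does not point at the site's stock "no photo" placeholder.
A minimal standalone sketch of the same pattern, for review purposes only --
extract_image_url is a hypothetical helper, not part of this patch, and the
exception tuple replaces the crawler's bare except: with the specific failures
this lookup can actually raise:

from bs4 import BeautifulSoup

# Stock placeholder the site serves when a recipe has no photo
# (the img_null constant introduced by this patch).
IMG_NULL = "http://images.media-allrecipes.com/images/82126.jpg"

def extract_image_url(html):
    # Hypothetical helper mirroring the new hunk in getDataFromPage.
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", {"class": "recipeDetailSummaryImageContainer"})
    try:
        img_url = container.find("a")["href"]
    except (AttributeError, TypeError, KeyError):
        # container is None, has no <a>, or the <a> has no href attribute
        return None
    return None if img_url == IMG_NULL else img_url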
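
For reviewers trying out the new utils.py, a hypothetical session might look like
the following (it assumes the four JSON files already exist at the D:\ paths
hard-coded in the module, since they are loaded at import time):

import utils

# Collect every distinct tag and index keyword across all four files.
tags = utils.countTag(utils.file_list)
print(len(tags), "distinct tags/index entries")

# Backfill entries whose "source" is null, then write the files back out.
utils.updateSource(utils.file_list)   # prints how many dishes it fixed
utils.saveFiles()

One thing worth flagging: utils.py never closes the file handles it opens;
wrapping the open() calls in with blocks would make the writes in saveFiles
more robust.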