From 724013d87baf039626adc0ff6850b2ade9d559f6 Mon Sep 17 00:00:00 2001
From: Shengquan Ni
Date: Tue, 5 Feb 2019 21:01:51 -0800
Subject: [PATCH] update crawler

---
 webCrawler/crawlerForEatingWell.py | 14 +++++++++---
 webCrawler/utils.py                | 35 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100644 webCrawler/utils.py

diff --git a/webCrawler/crawlerForEatingWell.py b/webCrawler/crawlerForEatingWell.py
index 7176c38..0dc06e5 100644
--- a/webCrawler/crawlerForEatingWell.py
+++ b/webCrawler/crawlerForEatingWell.py
@@ -10,9 +10,9 @@
 lunch_file="D:\\lunch.json"
 breakfast_file="D:\\breakfast.json"
 other_file="D:\\other.json"
-cache_filename="D:\\cache.tmp"
 max_try=5
 img_src_full_star="https://images.media-allrecipes.com/EatingWellAssets/assets/svg/icon/recipe-rating-stars/eating-well-star-full.svg"
+img_null="http://images.media-allrecipes.com/images/82126.jpg"
 
 def getDataFromPage(page,dinner,lunch,breakfast,other):
     p=requests.get(page)
@@ -52,7 +52,15 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
         author=temp.text.strip()
     except:
         print("no author! url = ",page)
-        author=None
+        author="EatingWell Kitchen"
+    temp=soup.find("div",{"class":"recipeDetailSummaryImageContainer"})
+    try:
+        img_url=temp.find("a")['href']
+        if img_url==img_null:
+            img_url=None
+    except:
+        print("image not found! url = ",page)
+        img_url=None
     tags=[i.text.strip() for i in soup.find_all("span",{"class":"nutritionTag"})]
     ingredients=[i.text.strip() for i in soup.find_all("span",{"itemprop":"ingredients"})]
     try:
@@ -102,7 +110,7 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
     avg_rating/=len(comments)
     json_obj={"index":index,"title":title,"avg_rating":avg_rating,"comments":comments,\
         "related":related,"tips":tips,"cals":cals,"total_time":total_time,"prep_time":prep_time,\
-        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions}
+        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions, "image_url":img_url}
     small_index="".join(i.lower() for i in index)
     if "dinner" in small_index:
         dinner.append(json_obj)
diff --git a/webCrawler/utils.py b/webCrawler/utils.py
new file mode 100644
index 0000000..177d82f
--- /dev/null
+++ b/webCrawler/utils.py
@@ -0,0 +1,35 @@
+import json
+dinner_file="D:\\dinner.json"
+lunch_file="D:\\lunch.json"
+breakfast_file="D:\\breakfast.json"
+other_file="D:\\other.json"
+dinner=json.load(open(dinner_file))
+breakfast=json.load(open(breakfast_file))
+lunch=json.load(open(lunch_file))
+other=json.load(open(other_file))
+file_list=[breakfast,dinner,lunch,other]
+
+def countTag(files):
+    tags=set()
+    for i in files:
+        for j in i:
+            for k in j['tags']:
+                tags.add(k)
+            for k in j['index']:
+                tags.add(k)
+    return tags
+
+def saveFiles():
+    json.dump(dinner,open(dinner_file,'w'))
+    json.dump(breakfast,open(breakfast_file,'w'))
+    json.dump(lunch,open(lunch_file,'w'))
+    json.dump(other,open(other_file,'w'))
+
+def updateSource(files):
+    count=0
+    for i in files:
+        for j in i:
+            if j['source']==None:
+                j['source']="EatingWell Kitchen"
+                count+=1
+    print("found {} dishes without source".format(count))
\ No newline at end of file
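
Reviewer note on the new image hunk: it keeps an href only when the summary-image
container exists and does not point at the site's stock "no photo" placeholder.
A minimal standalone sketch of the same pattern, for review purposes only --
extract_image_url is a hypothetical helper, not part of this patch, and the
exception tuple replaces the crawler's bare except: with the specific failures
this lookup can actually raise:

from bs4 import BeautifulSoup

# Stock placeholder the site serves when a recipe has no photo
# (the img_null constant introduced by this patch).
IMG_NULL = "http://images.media-allrecipes.com/images/82126.jpg"

def extract_image_url(html):
    # Hypothetical helper mirroring the new hunk in getDataFromPage.
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", {"class": "recipeDetailSummaryImageContainer"})
    try:
        img_url = container.find("a")["href"]
    except (AttributeError, TypeError, KeyError):
        # container is None, has no <a>, or the <a> has no href attribute
        return None
    return None if img_url == IMG_NULL else img_url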
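
For reviewers trying out the new utils.py, a hypothetical session might look like
the following (it assumes the four JSON files already exist at the D:\ paths
hard-coded in the module, since they are loaded at import time):

import utils

# Collect every distinct tag and index keyword across all four files.
tags = utils.countTag(utils.file_list)
print(len(tags), "distinct tags/index entries")

# Backfill entries whose "source" is null, then write the files back out.
utils.updateSource(utils.file_list)   # prints how many dishes it fixed
utils.saveFiles()

One thing worth flagging: utils.py never closes the file handles it opens;
wrapping the open() calls in with blocks would make the writes in saveFiles
more robust.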