update crawler
hiseen committed Feb 6, 2019
1 parent 4c74c27 commit 724013d
Showing 2 changed files with 46 additions and 3 deletions.
14 changes: 11 additions & 3 deletions webCrawler/crawlerForEatingWell.py
@@ -10,9 +10,9 @@
 lunch_file="D:\\lunch.json"
 breakfast_file="D:\\breakfast.json"
 other_file="D:\\other.json"
-cache_filename="D:\\cache.tmp"
 max_try=5
 img_src_full_star="https://images.media-allrecipes.com/EatingWellAssets/assets/svg/icon/recipe-rating-stars/eating-well-star-full.svg"
+img_null="http://images.media-allrecipes.com/images/82126.jpg"
 
 def getDataFromPage(page,dinner,lunch,breakfast,other):
     p=requests.get(page)
@@ -52,7 +52,15 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
         author=temp.text.strip()
     except:
         print("no author! url = ",page)
-        author=None
+        author="EatingWell Kitchen"
+    temp=soup.find("div",{"class":"recipeDetailSummaryImageContainer"})
+    try:
+        img_url=temp.find("a")['href']
+        if img_url==img_null:
+            img_url=None
+    except:
+        print("image not found! url = ",page)
+        img_url=None
     tags=[i.text.strip() for i in soup.find_all("span",{"class":"nutritionTag"})]
     ingredients=[i.text.strip() for i in soup.find_all("span",{"itemprop":"ingredients"})]
     try:
@@ -102,7 +110,7 @@ def getDataFromPage(page,dinner,lunch,breakfast,other):
         avg_rating/=len(comments)
     json_obj={"index":index,"title":title,"avg_rating":avg_rating,"comments":comments,\
         "related":related,"tips":tips,"cals":cals,"total_time":total_time,"prep_time":prep_time,\
-        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions}
+        "servings":servings,"ingredients":ingredients,"tags":tags,"summary":summary,"author":author,"source":source,"instructions":instructions, "image_url":img_url}
     small_index="".join(i.lower() for i in index)
     if "dinner" in small_index:
         dinner.append(json_obj)
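For context, here is the image-extraction logic this commit adds, pulled out as a standalone sketch. fetch_image_url is a hypothetical helper name introduced only for illustration; img_null is the stock "no photo" asset the site serves, taken from the diff above.

import requests
from bs4 import BeautifulSoup

# Placeholder asset the site returns when a recipe has no photo (from the diff).
img_null="http://images.media-allrecipes.com/images/82126.jpg"

def fetch_image_url(page):
    """Return the recipe image URL for page, or None if missing or a placeholder."""
    soup=BeautifulSoup(requests.get(page).text,"html.parser")
    container=soup.find("div",{"class":"recipeDetailSummaryImageContainer"})
    try:
        img_url=container.find("a")['href']
        # Treat the site's stock "no image" asset the same as a missing image.
        if img_url==img_null:
            img_url=None
    except (AttributeError,TypeError,KeyError):
        print("image not found! url = ",page)
        img_url=None
    return img_url

Catching AttributeError/TypeError/KeyError here keeps genuine network or parsing bugs visible; the commit itself uses a bare except, matching the style of the surrounding crawler code.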
35 changes: 35 additions & 0 deletions webCrawler/utils.py
@@ -0,0 +1,35 @@
+import json
+dinner_file="D:\\dinner.json"
+lunch_file="D:\\lunch.json"
+breakfast_file="D:\\breakfast.json"
+other_file="D:\\other.json"
+dinner=json.load(open(dinner_file))
+breakfast=json.load(open(breakfast_file))
+lunch=json.load(open(lunch_file))
+other=json.load(open(other_file))
+file_list=[breakfast,dinner,lunch,other]
+
+def countTag(files):
+    tags=set()
+    for i in files:
+        for j in i:
+            for k in j['tags']:
+                tags.add(k)
+            for k in j['index']:
+                tags.add(k)
+    return tags
+
+def saveFiles():
+    json.dump(dinner,open(dinner_file,'w'))
+    json.dump(breakfast,open(breakfast_file,'w'))
+    json.dump(lunch,open(lunch_file,'w'))
+    json.dump(other,open(other_file,'w'))
+
+def updateSource(files):
+    count=0
+    for i in files:
+        for j in i:
+            if j['source']==None:
+                j['source']="EatingWell Kitchen"
+                count+=1
+    print("found {} dishes without source".format(count))
