|
# Follow a chain of links in HTML: read the page at `url`, extract the href=
# values from its anchor tags, follow the link at a particular (1-based)
# position, repeat the process `count` times, and report the last name found.

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re

# Ignore SSL certificate errors so test/self-signed hosts still load.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter URL: ')
if len(url) < 1:
    # Defaults to allow quick testing when no URL is supplied.
    url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
    count = 4
    position = 3
else:
    # Convert once at the input edge instead of calling int() every iteration.
    count = int(input('Enter Count: '))
    position = int(input('Enter Position: '))

# Retrieve the page and follow the link at `position` exactly `count` times
# (replaces the manual loop_count/while True/break bookkeeping).
for _ in range(count):
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    # href values of every anchor tag, in document order.
    links = [tag.get('href', None) for tag in soup('a')]
    url = links[position - 1]  # `position` is 1-based per the exercise

print(url)
# Raw string with an escaped dot: the original 'by_(.*).html' let '.' match
# any character. Guarded so a non-matching final URL doesn't raise IndexError.
match = re.search(r'by_(.*)\.html', url)
if match:
    print("Name of the person: ", match.group(1))
0 commit comments