From 7c86f2270442c6ffb59629655db1c52a66ea149d Mon Sep 17 00:00:00 2001 From: marcosf63 Date: Mon, 4 Jan 2016 23:13:22 -0300 Subject: [PATCH] Usando BeautifulSoup --- webscraping/a.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/webscraping/a.py b/webscraping/a.py index 68e7011..46f7f26 100644 --- a/webscraping/a.py +++ b/webscraping/a.py @@ -2,12 +2,27 @@ # -*- coding: utf-8 -*- import urllib2 +from bs4 import BeautifulSoup +import sys -t = urllib2.urlopen('http://www.gmasson.com.br/').read() +def pega_link(): + """ + Extrai os links de uma página + """ + try: + html_doc = urllib2.urlopen(sys.argv[1]).read() -# TAG -tags = t.split('')[0].split('>',1)[1] for tag in tags ] + soup = BeautifulSoup(html_doc, 'html.parser') -for i in tags: - print i + links = soup.find_all('a') + + for link in links: + print link.get('href') + + except IndexError: + print "Informe um parâmetro (site alvo)" + except ValueError: + print "Informe um parâmetro no formato http://www.site.com" + +if __name__ == '__main__': + pega_link()