-
Notifications
You must be signed in to change notification settings - Fork 27
Scraping URLs
George Richardson edited this page May 3, 2017
·
2 revisions
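This assumes you are working in a Notebook in which `db_host` (the database host name) and `url` (the address of the article to scrape) have already been defined.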
import os
import sys
from sqlalchemy import create_engine
from sqlalchemy import exc
from sqlalchemy import func
from sqlalchemy import Table, text
# Put the parent directory on the import path so the project imports that
# follow can be resolved when running from a sub-directory.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(module_path)
from internal_displacement.model.model import Status, Session, Category, Article, Content, Country, CountryTerm, \
Location, Report, ReportDateSpan, ArticleCategory, Base
# Assemble the PostgreSQL connection URL from its individual parts.
# NOTE(review): `db_host` is not defined anywhere on this page; it has to be
# set before this cell runs.
connection_settings = {
    'user': 'jupyter',
    'password': 'jupyter',
    'db_host': db_host,
    'db': 'id',
}
db_url = 'postgresql://{user}:{password}@{db_host}/{db}'.format(
    **connection_settings)

# Create the engine, bind the project's Session factory to it, and open the
# session that the rest of the notebook uses.
engine = create_engine(db_url)
Session.configure(bind=engine)
session = Session()
from internal_displacement.scraper import Scraper
from internal_displacement.interpreter import Interpreter
from internal_displacement.pipeline import Pipeline
import pandas as pd
# Build the scraping pipeline. No interpreter is supplied: None is passed in
# its place, so the imported Interpreter class is not used here.
scraper = Scraper()
interpreter = None
pipeline = Pipeline(session, scraper, interpreter)

# Register the article for the given URL and fetch it.
# NOTE(review): `url` is not defined on this page; set it to the address of
# the article to scrape before running this cell.
try:
    article = pipeline.create_article(url)
    pipeline.fetch_article(article)
except exc.IntegrityError:
    # Presumably raised when the URL is already stored in the database -
    # TODO confirm. Roll back the failed transaction so the session can be
    # used again.
    session.rollback()