Skip to content

Scraping URLs

George Richardson edited this page May 3, 2017 · 2 revisions

This assumes you are working in a Notebook.

1. Access the database and start a session:

# Connect to the Postgres database and start a SQLAlchemy session.
import os
import sys
from sqlalchemy import create_engine
from sqlalchemy import exc
from sqlalchemy import func
from sqlalchemy import Table, text

# Make the repository root importable when running from a notebook in a subdir.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from internal_displacement.model.model import Status, Session, Category, Article, Content, Country, CountryTerm, \
    Location, Report, ReportDateSpan, ArticleCategory, Base

# Fix: `db_host` was referenced below but never defined, so the snippet
# raised NameError as written. Read it from the environment, defaulting to
# localhost for a local notebook setup.
db_host = os.environ.get('DB_HOST', 'localhost')

# NOTE(review): user/password are hard-coded demo credentials for the
# 'jupyter' role; fine for a local walkthrough, not for anything shared.
db_url = 'postgresql://{user}:{password}@{db_host}/{db}'.format(
        user='jupyter', password='jupyter', db_host=db_host, db='id')

engine = create_engine(db_url)
Session.configure(bind=engine)  # bind the session factory to this engine
session = Session()             # session used by the pipeline steps below

2. Import the relevant modules:

from internal_displacement.scraper import Scraper
from internal_displacement.interpreter import Interpreter
from internal_displacement.pipeline import Pipeline
import pandas as pd

3. Initialize the Scraper, set the Interpreter to None, and initialize the Pipeline:

# Set up the pipeline: a Scraper for fetching pages, and the DB session
# created in step 1. The Interpreter is deliberately None here because this
# walkthrough only scrapes URLs — presumably Pipeline tolerates a missing
# interpreter for scrape-only use; confirm before running interpretation steps.
scraper = Scraper()
interpreter = None
pipeline = Pipeline(session, scraper, interpreter)

4. Process a single URL (first assign the article's URL to a variable named `url`):

# Process one article; `url` must already be bound to the article's URL.
try:
    # Create the Article row for this URL, then download its content.
    article = pipeline.create_article(url)
    pipeline.fetch_article(article)
except exc.IntegrityError:
    # Likely a duplicate URL hitting a unique constraint — TODO confirm.
    # Roll back so the session remains usable for subsequent URLs.
    session.rollback()