diff --git a/feedbag.gemspec b/feedbag.gemspec
index 3a9fa2c..4193201 100644
--- a/feedbag.gemspec
+++ b/feedbag.gemspec
@@ -1,11 +1,11 @@
 # -*- encoding: utf-8 -*-
- 
+
 Gem::Specification.new do |s|
   s.name = %q{feedbag}
   s.version = "0.9.2"
   s.homepage = "http://github.com/damog/feedbag"
   s.rubyforge_project = "feedbag"
- 
+
   s.authors = ["David Moreno", "Derek Willis"]
   s.date = %q{2013-12-07}
   s.description = %q{Ruby's favorite feed auto-discoverty tool}
diff --git a/lib/feedbag.rb b/lib/feedbag.rb
index 030306c..ce626e2 100644
--- a/lib/feedbag.rb
+++ b/lib/feedbag.rb
@@ -28,14 +28,14 @@
 
 class Feedbag
 
-	CONTENT_TYPES = [
-		'application/x.atom+xml',
-		'application/atom+xml',
-		'application/xml',
-		'text/xml',
-		'application/rss+xml',
-		'application/rdf+xml',
-	].freeze
+  CONTENT_TYPES = [
+    'application/x.atom+xml',
+    'application/atom+xml',
+    'application/xml',
+    'text/xml',
+    'application/rss+xml',
+    'application/rdf+xml',
+  ].freeze
 
   def self.feed?(url)
     new.feed?(url)
@@ -49,160 +49,160 @@ class Feedbag
   def initialize
     @feeds = []
   end
 
-	def feed?(url)
-		# use LWR::Simple.normalize some time
-		url_uri = URI.parse(url)
-		url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}"
-		url << "?#{url_uri.query}" if url_uri.query
-
-		# hack:
-		url.sub!(/^feed:\/\//, 'http://')
-
-		res = Feedbag.find(url)
-		if res.size == 1 and res.first == url
-			return true
-		else
-			return false
-		end
-	end
-
-	def find(url, args = {})
-		url_uri = URI.parse(url)
-		url = nil
-		if url_uri.scheme.nil?
-			url = "http://#{url_uri.to_s}"
-		elsif url_uri.scheme == "feed"
-			return self.add_feed(url_uri.to_s.sub(/^feed:\/\//, 'http://'), nil)
-		else
-			url = url_uri.to_s
-		end
-		#url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}"
-
-		# check if feed_valid is avail
+  def feed?(url)
+    # use LWR::Simple.normalize some time
+    url_uri = URI.parse(url)
+    url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}"
+    url << "?#{url_uri.query}" if url_uri.query
+
+    # hack:
+    url.sub!(/^feed:\/\//, 'http://')
+
+    res = Feedbag.find(url)
+    if res.size == 1 and res.first == url
+      return true
+    else
+      return false
+    end
+  end
+
+  def find(url, args = {})
+    url_uri = URI.parse(url)
+    url = nil
+    if url_uri.scheme.nil?
+      url = "http://#{url_uri.to_s}"
+    elsif url_uri.scheme == "feed"
+      return self.add_feed(url_uri.to_s.sub(/^feed:\/\//, 'http://'), nil)
+    else
+      url = url_uri.to_s
+    end
+    #url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}"
+
+    # check if feed_valid is avail
     begin
-			require "feed_validator"
-			v = W3C::FeedValidator.new
-			v.validate_url(url)
-			return self.add_feed(url, nil) if v.valid?
-		rescue LoadError
-			# scoo
-		rescue REXML::ParseException
-			# usually indicates timeout
-			# TODO: actually find out timeout. use Terminator?
-			# $stderr.puts "Feed looked like feed but might not have passed validation or timed out"
+      require "feed_validator"
+      v = W3C::FeedValidator.new
+      v.validate_url(url)
+      return self.add_feed(url, nil) if v.valid?
+    rescue LoadError
+      # scoo
+    rescue REXML::ParseException
+      # usually indicates timeout
+      # TODO: actually find out timeout. use Terminator?
+      # $stderr.puts "Feed looked like feed but might not have passed validation or timed out"
     rescue => ex
-			$stderr.puts "#{ex.class} error occurred with: `#{url}': #{ex.message}"
-		end
-
-		begin
-			html = open(url) do |f|
-				content_type = f.content_type.downcase
-				if content_type == "application/octet-stream" # open failed
-					content_type = f.meta["content-type"].gsub(/;.*$/, '')
-				end
-				if CONTENT_TYPES.include?(content_type)
-					return self.add_feed(url, nil)
-				end
-
-				doc = Nokogiri::HTML(f.read)
-
-				if doc.at("base") and doc.at("base")["href"]
-					@base_uri = doc.at("base")["href"]
-				else
-					@base_uri = nil
-				end
-
-				# first with links
+      $stderr.puts "#{ex.class} error occurred with: `#{url}': #{ex.message}"
+    end
+
+    begin
+      html = open(url) do |f|
+        content_type = f.content_type.downcase
+        if content_type == "application/octet-stream" # open failed
+          content_type = f.meta["content-type"].gsub(/;.*$/, '')
+        end
+        if CONTENT_TYPES.include?(content_type)
+          return self.add_feed(url, nil)
+        end
+
+        doc = Nokogiri::HTML(f.read)
+
+        if doc.at("base") and doc.at("base")["href"]
+          @base_uri = doc.at("base")["href"]
+        else
+          @base_uri = nil
+        end
+
+        # first with links
         (doc/"atom:link").each do |l|
-					next unless l["rel"]
-					if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and l["rel"].downcase == "self"
-						self.add_feed(l["href"], url, @base_uri)
-					end
-				end
-
-				(doc/"link").each do |l|
-					next unless l["rel"]
-					if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and (l["rel"].downcase =~ /alternate/i or l["rel"] == "service.feed")
-						self.add_feed(l["href"], url, @base_uri)
-					end
-				end
-
-				(doc/"a").each do |a|
-					next unless a["href"]
-					if self.looks_like_feed?(a["href"]) and (a["href"] =~ /\// or a["href"] =~ /#{url_uri.host}/)
-						self.add_feed(a["href"], url, @base_uri)
-					end
-				end
-
-				(doc/"a").each do |a|
-					next unless a["href"]
-					if self.looks_like_feed?(a["href"])
-						self.add_feed(a["href"], url, @base_uri)
-					end
-				end
+          next unless l["rel"]
+          if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and l["rel"].downcase == "self"
+            self.add_feed(l["href"], url, @base_uri)
+          end
+        end
+
+        (doc/"link").each do |l|
+          next unless l["rel"]
+          if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and (l["rel"].downcase =~ /alternate/i or l["rel"] == "service.feed")
+            self.add_feed(l["href"], url, @base_uri)
+          end
+        end
+
+        (doc/"a").each do |a|
+          next unless a["href"]
+          if self.looks_like_feed?(a["href"]) and (a["href"] =~ /\// or a["href"] =~ /#{url_uri.host}/)
+            self.add_feed(a["href"], url, @base_uri)
+          end
+        end
+
+        (doc/"a").each do |a|
+          next unless a["href"]
+          if self.looks_like_feed?(a["href"])
+            self.add_feed(a["href"], url, @base_uri)
+          end
+        end
 
         # Added support for feeds like http://tabtimes.com/tbfeed/mashable/full.xml
         if url.match(/.xml$/) and doc.root and doc.root["xml:base"] and doc.root["xml:base"].strip == url.strip
-					self.add_feed(url, nil)
+          self.add_feed(url, nil)
         end
-			end
-		rescue Timeout::Error => err
-			$stderr.puts "Timeout error occurred with `#{url}: #{err}'"
-		rescue OpenURI::HTTPError => the_error
-			$stderr.puts "Error occurred with `#{url}': #{the_error}"
-		rescue SocketError => err
-			$stderr.puts "Socket error occurred with: `#{url}': #{err}"
-		rescue => ex
-			$stderr.puts "#{ex.class} error occurred with: `#{url}': #{ex.message}"
-		ensure
-			return @feeds
-		end
-
-	end
-
-	def looks_like_feed?(url)
-		if url =~ /(\.(rdf|xml|rdf|rss)$|feed=(rss|atom)|(atom|feed)\/?$)/i
-			true
-		else
-			false
-		end
-	end
-
-	def add_feed(feed_url, orig_url, base_uri = nil)
-		# puts "#{feed_url} - #{orig_url}"
-		url = feed_url.sub(/^feed:/, '').strip
-
-		if base_uri
-			# url = base_uri + feed_url
-			url = URI.parse(base_uri).merge(feed_url).to_s
-		end
-
-		begin
-			uri = URI.parse(url)
-		rescue
-			puts "Error with `#{url}'"
-			exit 1
-		end
-		unless uri.absolute?
-			orig = URI.parse(orig_url)
-			url = orig.merge(url).to_s
-		end
-
-		# verify url is really valid
-		@feeds.push(url) unless @feeds.include?(url)# if self._is_http_valid(URI.parse(url), orig_url)
-	end
-
-	# not used. yet.
-	def _is_http_valid(uri, orig_url)
-		req = Net::HTTP.get_response(uri)
-		orig_uri = URI.parse(orig_url)
-		case req
-		when Net::HTTPSuccess then
-			return true
-		else
-			return false
-		end
-	end
+      end
+    rescue Timeout::Error => err
+      $stderr.puts "Timeout error occurred with `#{url}: #{err}'"
+    rescue OpenURI::HTTPError => the_error
+      $stderr.puts "Error occurred with `#{url}': #{the_error}"
+    rescue SocketError => err
+      $stderr.puts "Socket error occurred with: `#{url}': #{err}"
+    rescue => ex
+      $stderr.puts "#{ex.class} error occurred with: `#{url}': #{ex.message}"
+    ensure
+      return @feeds
+    end
+
+  end
+
+  def looks_like_feed?(url)
+    if url =~ /(\.(rdf|xml|rdf|rss)$|feed=(rss|atom)|(atom|feed)\/?$)/i
+      true
+    else
+      false
+    end
+  end
+
+  def add_feed(feed_url, orig_url, base_uri = nil)
+    # puts "#{feed_url} - #{orig_url}"
+    url = feed_url.sub(/^feed:/, '').strip
+
+    if base_uri
+      # url = base_uri + feed_url
+      url = URI.parse(base_uri).merge(feed_url).to_s
+    end
+
+    begin
+      uri = URI.parse(url)
+    rescue
+      puts "Error with `#{url}'"
+      exit 1
+    end
+    unless uri.absolute?
+      orig = URI.parse(orig_url)
+      url = orig.merge(url).to_s
+    end
+
+    # verify url is really valid
+    @feeds.push(url) unless @feeds.include?(url)# if self._is_http_valid(URI.parse(url), orig_url)
+  end
+
+  # not used. yet.
+  def _is_http_valid(uri, orig_url)
+    req = Net::HTTP.get_response(uri)
+    orig_uri = URI.parse(orig_url)
+    case req
+    when Net::HTTPSuccess then
+      return true
+    else
+      return false
+    end
+  end
 end
 
 if __FILE__ == $0
diff --git a/test/feedbag_test.rb b/test/feedbag_test.rb
index acd409f..6a8486a 100644
--- a/test/feedbag_test.rb
+++ b/test/feedbag_test.rb
@@ -1,7 +1,7 @@
 require 'test_helper'
 
 class FeedbagTest < Test::Unit::TestCase
- 
+
   context "Feedbag.feed? should know that an RSS url is a feed" do
     setup do
       @rss_url = 'http://example.com/rss/'
@@ -11,7 +11,7 @@ class FeedbagTest < Test::Unit::TestCase
       assert Feedbag.feed?(@rss_url)
     end
   end
- 
+
   context "Feedbag.feed? should know that an RSS url with parameters is a feed" do
     setup do
      @rss_url = "http://example.com/data?format=rss"
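
Notes (not part of the patch): the diff above is whitespace-only; tab-indented lines are re-emitted with two-space indentation and trailing whitespace is stripped, so no behavior change is expected. The Ruby snippets that follow are illustrative sketches of the code being reindented. URLs and printed values are placeholders, not project fixtures.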
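
A quick smoke check of the two public entry points touched here. Feedbag.new.find and Feedbag.feed? both appear in the hunk; note that find performs live HTTP requests through open-uri, so this is a sketch rather than a unit test:

    require "feedbag"

    # Fetch the page and scan its <link>/<a> tags for feed URLs.
    feeds = Feedbag.new.find("http://example.com/")
    p feeds # => e.g. ["http://example.com/rss/"]

    # feed? normalizes the URL, runs discovery on it, and is true only
    # when the result is exactly [url].
    p Feedbag.feed?("http://example.com/rss/")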
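
Inside find, a response served as application/octet-stream falls back to the raw content-type header with its parameters stripped before the CONTENT_TYPES lookup. The gsub is easy to verify in isolation:

    # Same expression as in the hunk: drop everything from ";" onward.
    raw = "application/rss+xml; charset=UTF-8"
    puts raw.gsub(/;.*$/, '') # => application/rss+xml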
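
looks_like_feed? is a pure string heuristic, so it can be probed without any network access (the duplicated rdf alternative in the pattern is redundant but harmless). A small probe using the regex exactly as committed:

    pattern = /(\.(rdf|xml|rdf|rss)$|feed=(rss|atom)|(atom|feed)\/?$)/i

    %w[
      http://example.com/index.rss
      http://example.com/posts.xml
      http://example.com/?feed=atom
      http://example.com/feed/
      http://example.com/about
    ].each do |u|
      puts "#{u} => #{!!(u =~ pattern)}" # every URL but the last matches
    end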
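
add_feed resolves a relative href against the page's <base href> when one was captured, and otherwise against the page URL itself. The same two URI#merge calls, reproduced outside the class with made-up URLs:

    require "uri"

    page = "http://example.com/blog/post"
    base = "http://example.com/" # what @base_uri holds when <base href> is present

    # With a <base>, relative hrefs resolve against it:
    puts URI.parse(base).merge("feed.xml") # => http://example.com/feed.xml

    # Without one, add_feed falls back to merging against orig_url:
    puts URI.parse(page).merge("feed.xml") # => http://example.com/blog/feed.xml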