-require 'rubygems'
 require 'json'
 
 module Jekyll
+  module LunrJsSearch
+    class Indexer < Jekyll::Generator
+      def initialize(config = {})
+        super(config)
+
+        lunr_config = {
+          'excludes' => [],
+          'strip_index_html' => false,
+          'min_length' => 3,
+          'stopwords' => 'stopwords.txt'
+        }.merge!(config['lunr_search'] || {})
+
+        @excludes = lunr_config['excludes']
+
+        # if web host supports index.html as default doc, then optionally exclude it from the url
+        @strip_index_html = lunr_config['strip_index_html']
 
-  class Indexer < Generator
-
-    def initialize(config = {})
-      super(config)
-
-      lunr_config = {
-        'excludes' => [],
-        'strip_index_html' => false,
-        'min_length' => 3,
-        'stopwords' => 'stopwords.txt'
-      }.merge!(config['lunr_search'] || {})
-
-      @excludes = lunr_config['excludes']
-
-      # if web host supports index.html as default doc, then optionally exclude it from the url
-      @strip_index_html = lunr_config['strip_index_html']
-
-      # stop word exclusion configuration
-      @min_length = lunr_config['min_length']
-      @stopwords_file = lunr_config['stopwords']
-    end
-
-    # Index all pages except pages matching any value in config['lunr_excludes'] or with date['exclude_from_search']
-    # The main content from each page is extracted and saved to disk as json
-    def generate(site)
-      puts 'Running the search indexer...'
-
-      # gather pages and posts
-      items = pages_to_index(site)
-      content_renderer = PageRenderer.new(site)
-      index = []
-
-      items.each do |item|
-        entry = SearchEntry.create(item, content_renderer)
+        # stop word exclusion configuration
+        @min_length = lunr_config['min_length']
+        @stopwords_file = lunr_config['stopwords']
+      end
 
-        entry.strip_index_suffix_from_url! if @strip_index_html
-        entry.strip_stopwords!(stopwords, @min_length) if File.exists?(@stopwords_file)
+      # Index all pages except pages matching any value in config['lunr_excludes'] or with date['exclude_from_search']
+      # The main content from each page is extracted and saved to disk as json
+      def generate(site)
+        puts 'Running the search indexer...'
+
+        # gather pages and posts
+        items = pages_to_index(site)
+        content_renderer = PageRenderer.new(site)
+        index = []
+
+        items.each do |item|
+          entry = SearchEntry.create(item, content_renderer)
+
+          entry.strip_index_suffix_from_url! if @strip_index_html
+          entry.strip_stopwords!(stopwords, @min_length) if File.exists?(@stopwords_file)
+
+          index << {
+            :title => entry.title,
+            :url => entry.url,
+            :date => entry.date,
+            :categories => entry.categories,
+            :body => entry.body
+          }
+
+          puts 'Indexed ' << "#{entry.title} (#{entry.url})"
+        end
+
+        json = JSON.generate({:entries => index})
 
-        index << {
-          :title => entry.title,
-          :url => entry.url,
-          :date => entry.date,
-          :categories => entry.categories,
-          :body => entry.body
-        }
+        # Create destination directory if it doesn't exist yet. Otherwise, we cannot write our file there.
+        Dir::mkdir(site.dest) unless File.directory?(site.dest)
 
-        puts 'Indexed ' << "#{entry.title} (#{entry.url})"
+        # File I/O: create search.json file and write out the JSON
+        filename = 'search.json'
+
+        File.open(File.join(site.dest, filename), "w") do |file|
+          file.write(json)
+        end
+
+        # Keep the search.json file from being cleaned by Jekyll
+        site.static_files << SearchIndexFile.new(site, site.dest, "/", filename)
       end
+
+      private
 
-      json = JSON.generate({:entries => index})
-
-      # Create destination directory if it doesn't exist yet. Otherwise, we cannot write our file there.
-      Dir::mkdir(site.dest) unless File.directory?(site.dest)
-
-      # File I/O: create search.json file and write out pretty-printed JSON
-      filename = 'search.json'
-
-      File.open(File.join(site.dest, filename), "w") do |file|
-        file.write(json)
+      # load the stopwords file
+      def stopwords
+        @stopwords ||= IO.readlines(@stopwords_file).map { |l| l.strip }
       end
-
-      # Keep the search.json file from being cleaned by Jekyll
-      site.static_files << Jekyll::SearchIndexFile.new(site, site.dest, "/", filename)
-    end
-
-    private
-
-    # load the stopwords file
-    def stopwords
-      @stopwords ||= IO.readlines(@stopwords_file).map { |l| l.strip }
-    end
-
-    def pages_to_index(site)
-      items = []
 
-      # deep copy pages
-      site.pages.each {|page| items << page.dup }
-      site.posts.each {|post| items << post.dup }
+      def pages_to_index(site)
+        items = []
+
+        # deep copy pages
+        site.pages.each {|page| items << page.dup }
+        site.posts.each {|post| items << post.dup }
 
-      # only process files that will be converted to .html and only non excluded files
-      items.select! {|i| i.output_ext == '.html' && ! @excludes.any? {|s| (i.url =~ Regexp.new(s)) != nil } }
-      items.reject! {|i| i.data['exclude_from_search'] }
-
-      items
+        # only process files that will be converted to .html and only non excluded files
+        items.select! {|i| i.output_ext == '.html' && ! @excludes.any? {|s| (i.url =~ Regexp.new(s)) != nil } }
+        items.reject! {|i| i.data['exclude_from_search'] }
+
+        items
+      end
     end
   end
 end
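For reference, the defaults in `initialize` merge per-key with whatever the site supplies under the `lunr_search` key, and site values win. A minimal standalone sketch of that merge; the override values here are invented, not taken from any real site config:

# Invented site config; a real site would set these under lunr_search: in _config.yml.
config = { 'lunr_search' => { 'min_length' => 4, 'excludes' => ['rss\.xml$'] } }

lunr_config = {
  'excludes' => [],
  'strip_index_html' => false,
  'min_length' => 3,
  'stopwords' => 'stopwords.txt'
}.merge!(config['lunr_search'] || {})

lunr_config['min_length']       # => 4 (site override wins)
lunr_config['strip_index_html'] # => false (default kept)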
 require 'nokogiri'
 
 module Jekyll
-
-  class PageRenderer
-    def initialize(site)
-      @site = site
-    end
-
-    # render the item, parse the output and get all text inside <p> elements
-    def render(item)
-      item.render({}, @site.site_payload)
-      doc = Nokogiri::HTML(item.output)
-      paragraphs = doc.search('//text()').map {|t| t.content }
-      paragraphs = paragraphs.join(" ").gsub("\r", " ").gsub("\n", " ").gsub("\t", " ").gsub(/\s+/, " ")
+  module LunrJsSearch
+    class PageRenderer
+      def initialize(site)
+        @site = site
+      end
+
+      # render the item, parse the output and get the text from all nodes
+      def render(item)
+        item.render({}, @site.site_payload)
+        doc = Nokogiri::HTML(item.output)
+        paragraphs = doc.search('//text()').map {|t| t.content }
+        paragraphs = paragraphs.join(" ").gsub("\r", " ").gsub("\n", " ").gsub("\t", " ").gsub(/\s+/, " ")
+      end
     end
-  end
-
+  end
 end
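A note on `PageRenderer#render`: the `//text()` XPath collects every text node in the rendered page (the pre-refactor comment said `<p>` elements, which is narrower than what the code does), and the `gsub` chain collapses all whitespace to single spaces. A standalone sketch with an invented HTML string:

require 'nokogiri'

html = '<html><body><h1>Title</h1><p>Hello,  world</p></body></html>'
doc = Nokogiri::HTML(html)
text = doc.search('//text()').map { |t| t.content }
text.join(" ").gsub(/\s+/, " ")  # => "Title Hello, world"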
 require 'nokogiri'
 
 module Jekyll
-
-  class SearchEntry
-    def self.create(page_or_post, renderer)
-      return create_from_post(page_or_post, renderer) if page_or_post.is_a?(Jekyll::Post)
-      return create_from_page(page_or_post, renderer) if page_or_post.is_a?(Jekyll::Page)
-      raise 'Not supported'
-    end
-
-    def self.create_from_page(page, renderer)
-      title, url = extract_title_and_url(page)
-      body = renderer.render(page)
-      date = nil
-      categories = []
+  module LunrJsSearch
+    class SearchEntry
+      def self.create(page_or_post, renderer)
+        return create_from_post(page_or_post, renderer) if page_or_post.is_a?(Jekyll::Post)
+        return create_from_page(page_or_post, renderer) if page_or_post.is_a?(Jekyll::Page)
+        raise 'Not supported'
+      end
 
-      SearchEntry.new(title, url, date, categories, body)
-    end
-
-    def self.create_from_post(post, renderer)
-      title, url = extract_title_and_url(post)
-      body = renderer.render(post)
-      date = post.date
-      categories = post.categories
+      def self.create_from_page(page, renderer)
+        title, url = extract_title_and_url(page)
+        body = renderer.render(page)
+        date = nil
+        categories = []
+
+        SearchEntry.new(title, url, date, categories, body)
+      end
 
-      SearchEntry.new(title, url, date, categories, body)
-    end
+      def self.create_from_post(post, renderer)
+        title, url = extract_title_and_url(post)
+        body = renderer.render(post)
+        date = post.date
+        categories = post.categories
+
+        SearchEntry.new(title, url, date, categories, body)
+      end
 
-    def self.extract_title_and_url(item)
-      data = item.to_liquid
-      [ data['title'], data['url'] ]
-    end
+      def self.extract_title_and_url(item)
+        data = item.to_liquid
+        [ data['title'], data['url'] ]
+      end
 
-    attr_reader :title, :url, :date, :categories, :body
-
-    def initialize(title, url, date, categories, body)
-      @title, @url, @date, @categories, @body = title, url, date, categories, body
-    end
-
-    def strip_index_suffix_from_url!
-      @url.gsub!(/index\.html$/, '')
+      attr_reader :title, :url, :date, :categories, :body
+
+      def initialize(title, url, date, categories, body)
+        @title, @url, @date, @categories, @body = title, url, date, categories, body
+      end
+
+      def strip_index_suffix_from_url!
+        @url.gsub!(/index\.html$/, '')
+      end
+
+      # remove anything that is in the stop words list from the text to be indexed
+      def strip_stopwords!(stopwords, min_length)
+        @body = @body.split.delete_if() do |x|
+          t = x.downcase.gsub(/[^a-z]/, '')
+          t.length < min_length || stopwords.include?(t)
+        end.join(' ')
+      end
     end
-
-    # remove anything that is in the stop words list from the text to be indexed
-    def strip_stopwords!(stopwords, min_length)
-      @body = @body.split.delete_if() do |x|
-        t = x.downcase.gsub(/[^a-z]/, '')
-        t.length < min_length || stopwords.include?(t)
-      end.join(' ')
-    end
   end
 end
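To make `strip_stopwords!` concrete: each whitespace-separated token is reduced to its downcased letters for the test, and the token is dropped when that form is shorter than `min_length` or appears in the stopword list; surviving tokens keep their original form. A quick illustration, assuming the plugin above is loaded (the body text and stopword list are made up):

entry = Jekyll::LunrJsSearch::SearchEntry.new(
  'A Post', '/a-post/', nil, [], 'The quick fox is on a hill.')
entry.strip_stopwords!(['the', 'is'], 3)
entry.body  # => "quick fox hill."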
 module Jekyll
-
-  class SearchIndexFile < StaticFile
-    # Override write as the search.json index file has already been created
-    def write(dest)
-      true
-    end
+  module LunrJsSearch
+    class SearchIndexFile < Jekyll::StaticFile
+      # Override write as the search.json index file has already been created
+      def write(dest)
+        true
+      end
+    end
   end
-
+end
+module Jekyll
+  module LunrJsSearch
+    VERSION = "0.1.1"
+  end
 end
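For context, `generate` writes an index of the shape below. Note that despite the pre-refactor comment, `JSON.generate` emits compact JSON; `JSON.pretty_generate` would be the pretty-printing alternative. A sketch with invented field values:

require 'json'

# One entry per indexed page or post; date is nil for pages, a Time for posts.
index = [{
  :title      => 'Hello World',
  :url        => '/2014/01/01/hello-world.html',
  :date       => nil,
  :categories => [],
  :body       => 'quick fox hill.'
}]
puts JSON.generate({:entries => index})
# => {"entries":[{"title":"Hello World","url":"/2014/01/01/hello-world.html","date":null,"categories":[],"body":"quick fox hill."}]}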