|
| 1 | +# frozen_string_literal: true |
| 2 | + |
| 3 | +# Based on https://github.com/ruby/prism/blob/main/rakelib/lex.rake |
| 4 | + |
module GemIndexing
  class << self
    # Runs +block+ once for every element of +items+, spreading the calls over
    # a pool of worker threads, and blocks until every item has been processed.
    # This is particularly useful for tasks that are IO-bound like downloading
    # files or reading files from disk.
    #
    # The pool size is read from the WORKERS environment variable (default 16).
    # Values that +to_i+ turns into something below 1 (for example "0" or a
    # non-numeric string) fall back to a single worker so that items are never
    # silently skipped.
    #
    # Side effect: sets Thread.abort_on_exception = true process-wide, so an
    # exception raised by +block+ in any worker is re-raised in the main thread.
    #
    # Returns the array of (joined) worker threads.
    def parallelize(items, &block)
      Thread.abort_on_exception = true

      queue = Queue.new
      items.each { |item| queue << item }

      worker_count = [ENV.fetch("WORKERS") { 16 }.to_i, 1].max
      workers = Array.new(worker_count) { parallelize_thread(queue, &block) }

      workers.map(&:join)
    end

    private

    # Create a new thread with a minimal number of locals that it can access.
    #
    # The thread drains +queue+ with a non-blocking pop. Checking +empty?+ and
    # then calling a blocking +shift+ is a race: another worker can take the
    # last item in between, leaving this thread blocked forever. The queue is
    # fully populated before any worker starts and nothing is pushed afterwards,
    # so an empty queue (signalled by ThreadError) means there is no work left.
    def parallelize_thread(queue, &block)
      Thread.new do
        loop do
          item =
            begin
              queue.pop(true)
            rescue ThreadError
              break
            end

          block.call(item)
        end
      end
    end
  end
end
| 33 | + |
# YAML file listing the gems to fetch, and the directory they are unpacked into.
TOP_100_GEM_FILENAME = "rakelib/top_100_gems.yml"
TOP_100_GEMS_DIR = "tmp/top_100_gems"

namespace :download do
  # File task that creates the target directory when it does not exist yet.
  directory TOP_100_GEMS_DIR

  desc "Download the top 100 rubygems under #{TOP_100_GEMS_DIR}/"
  task topgems: TOP_100_GEMS_DIR do
    $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
    require "net/http"
    require "rubygems/package"
    require "tmpdir"
    require "yaml"

    # Each entry of the YAML list is interpolated into the rubygems.org URL and
    # used as the name of the directory the gem's Ruby files are extracted to.
    GemIndexing.parallelize(YAML.safe_load_file(TOP_100_GEM_FILENAME)) do |gem_name|
      directory = File.expand_path("#{TOP_100_GEMS_DIR}/#{gem_name}")
      # An existing directory is treated as "already downloaded".
      next if File.directory?(directory)

      puts "Downloading #{gem_name}"

      uri = URI.parse("https://rubygems.org/gems/#{gem_name}.gem")
      response = Net::HTTP.get_response(uri)
      unless response.is_a?(Net::HTTPSuccess)
        raise "Failed to download #{gem_name}: HTTP #{response.code} #{response.message}"
      end

      Dir.mktmpdir do |tmpdir|
        filepath = File.join(tmpdir, "#{gem_name}.gem")
        # A .gem file is a binary archive; write it in binary mode so no
        # newline or encoding conversion can corrupt it.
        File.binwrite(filepath, response.body)
        # Only the Ruby sources are extracted.
        Gem::Package.new(filepath).extract_files(directory, "**/*.rb")
      end
    end
  end
end
| 65 | + |
# This task indexes against the top 100 gems, and will exit(1) if any fail.
#
# Depends on download:topgems. For every gem listed in TOP_100_GEM_FILENAME it
# indexes each extracted .rb file into a fresh index (one per gem), collecting
# the message and file path of every exception raised along the way. The
# download directory is removed afterwards, whether or not indexing succeeded.
desc "Index against the top 100 rubygems"
task "index:topgems": ["download:topgems"] do
  $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
  require "net/http"
  require "rubygems/package"
  require "tmpdir"
  require "yaml"
  # NOTE(review): nothing here requires the file defining RubyIndexer;
  # presumably it is loaded elsewhere before this task runs - confirm.

  gem_names = YAML.safe_load_file(TOP_100_GEM_FILENAME)

  # Shared across worker threads: only ever appended to, under the lock, so
  # failures from one gem are never overwritten by the results of another.
  errors = []
  errors_lock = Mutex.new

  GemIndexing.parallelize(gem_names) do |gem_name|
    directory = File.expand_path("#{TOP_100_GEMS_DIR}/#{gem_name}")

    index = RubyIndexer::Index.new

    Dir[File.join(directory, "**", "*.rb")].each do |filepath|
      print(".")
      code = File.read(filepath)
      index.index_single(RubyIndexer::IndexablePath.new(nil, filepath), code)
    rescue => e
      # Record the failure and keep going with the remaining files.
      errors_lock.synchronize { errors << { message: e.message, file: filepath } }
    end
  end

  if errors.any?
    puts "errors: #{errors}"
    # Signal failure to the caller; the ensure clause below still runs.
    exit(1)
  end
ensure
  FileUtils.rm_rf(TOP_100_GEMS_DIR)
end
0 commit comments