Skip to content

Commit

Permalink
fix postal code downloader/parser
Browse files Browse the repository at this point in the history
  • Loading branch information
mithereal committed Mar 3, 2025
1 parent 6d0a928 commit fd0506c
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 36 deletions.
1 change: 1 addition & 0 deletions lib/location.ex
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Compile-time CSV parser modules generated by NimbleCSV.
# PostCodeCSV reads comma-separated rows (it is the parser used on the
# postal code source file); LocationCSV reads tab-separated rows.
# Both override the escape option with "\~" instead of relying on the
# library default.
NimbleCSV.define(PostCodeCSV, separator: ",", escape: "\~")
NimbleCSV.define(LocationCSV, separator: "\t", escape: "\~")

defmodule Location do
Expand Down
170 changes: 142 additions & 28 deletions lib/location/postalcode.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,40 +23,26 @@ defmodule Location.PostalCode do
{:decentralized_counters, false}
])

@ets_table_by_id =
:ets.new(@ets_table_by_id, [
:set,
:named_table,
:public,
:compressed,
{:write_concurrency, true},
{:read_concurrency, true},
{:decentralized_counters, false}
])

source_file()
|> File.stream!()
|> Stream.chunk_every(15_000)
|> Task.async_stream(
fn chunk ->
chunk
|> LocationCSV.parse_stream()
|> Stream.each(fn [
country_code,
postal_code,
city_name,
_state_name,
state_code,
_municipality,
_municipality_code,
_admin_name3,
_admin_code3,
latitude,
longitude,
_accuracy
] ->
country_code = String.trim(country_code)

true =
:ets.insert(
@ets_table_by_lookup,
{{country_code, state_code, city_name}, {postal_code, latitude, longitude}}
)

true =
:ets.insert(
@ets_table_by_id,
{postal_code, {country_code, state_code, city_name, latitude, longitude}}
)
|> PostCodeCSV.parse_stream()
|> Stream.each(fn data ->
__MODULE__.parse(data)
end)
|> Stream.run()
end,
Expand All @@ -65,6 +51,125 @@ defmodule Location.PostalCode do
|> Stream.run()
end

@doc """
Stores one parsed postal-code row in both ETS tables.

`row` is a list of column values. The first eleven columns are expected to be
country code, postal code, city name, state name, state code, municipality,
municipality code, admin name 3, admin code 3, latitude and longitude. Up to
three extra trailing columns (such as accuracy) are tolerated and ignored.

The country code is trimmed before it is used, whatever the number of
trailing columns, so the same place is always stored under the same key.

Returns `true` when the row was inserted. Rows with any other shape are
skipped and `:ok` is returned.
"""
@spec parse([String.t()]) :: true | :ok
def parse([
      country_code,
      postal_code,
      city_name,
      _state_name,
      state_code,
      _municipality,
      _municipality_code,
      _admin_name3,
      _admin_code3,
      latitude,
      longitude
      | trailing
    ])
    when length(trailing) <= 3 do
  country_code = String.trim(country_code)

  true =
    :ets.insert(
      @ets_table_by_lookup,
      {{country_code, state_code, city_name}, {postal_code, latitude, longitude}}
    )

  true =
    :ets.insert(
      @ets_table_by_id,
      {postal_code, {country_code, state_code, city_name, latitude, longitude}}
    )
end

# Anything that is not an 11..14 column row (blank lines, truncated rows,
# rows with too many columns) is ignored rather than crashing the loader.
def parse(_row), do: :ok

@doc """
Finds postal_code information by postal code.
"""
Expand Down Expand Up @@ -99,6 +204,15 @@ defmodule Location.PostalCode do
end
end

@doc """
Returns every postal code currently held in the by-id ETS table as a list of
structs. The list is empty when nothing has been loaded.

The order of the result is whatever `:ets.tab2list/1` yields and should not
be relied upon.
"""
@spec get_postal_codes() :: [%__MODULE__{}]
def get_postal_codes() do
  @ets_table_by_id
  |> :ets.tab2list()
  |> Enum.map(fn
    # Rows in the by-id table are keyed by postal code; the value tuple
    # carries the location fields (this is NOT the lookup-table shape).
    {postal_code, {country_code, state_code, city_name, latitude, longitude}} ->
      to_struct(postal_code, country_code, state_code, city_name, latitude, longitude)
  end)
end

defp source_file() do
default = Application.app_dir(:location, "/priv/postal_codes.csv")
Application.get_env(:location, :postal_codes_source_file, default)
Expand Down
1 change: 1 addition & 0 deletions lib/mix/tasks/update_geoname_data.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ defmodule Mix.Tasks.Location.UpdateGeonameData do
System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"])
zip_file = Unzip.LocalFile.open("/tmp/allCountries.zip")
{:ok, unzip} = Unzip.new(zip_file)

Unzip.file_stream!(unzip, "allCountries.txt")
|> Stream.into(File.stream!("/tmp/allCountries.txt"))
|> Stream.run()
Expand Down
16 changes: 9 additions & 7 deletions lib/mix/tasks/update_postal_code_data.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do
use Mix.Task
@shortdoc "Updates the postal code data from source"

@destination_filename Application.compile_env(:location, :postal_codes_source_file, "priv/postal_codes.csv")
@destination_filename Application.compile_env(
:location,
:postal_codes_source_file,
"priv/postal_codes.csv"
)

@doc """
The data source clocks in at 16mb. Expect this to take a while.
Expand Down Expand Up @@ -46,6 +50,7 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do

zip_file = Unzip.LocalFile.open("/tmp/#{name}.zip")
{:ok, unzip} = Unzip.new(zip_file)

Unzip.file_stream!(unzip, "#{name}.txt")
|> Stream.into(File.stream!("/tmp/#{name}.txt"))
|> Stream.run()
Expand All @@ -61,18 +66,15 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do
filename
|> File.stream!(read_ahead: 100_000)
|> Flow.from_enumerable()
|> Flow.map(&String.split(&1, tab))
|> Flow.map(&(String.trim(&1) |> String.split(tab)))
|> Flow.partition()
|> Enum.into([])

IO.puts("Writing result to #{@destination_filename}")

Location.Scraper.write_date_to_version()

case append do
false -> File.write!(@destination_filename, Enum.join(result, "\n"))
true -> File.write!(@destination_filename, Enum.join(result, "\n"), :append)
end

file = File.open!(@destination_filename, [:write, :utf8])
result |> CSV.encode() |> Enum.each(&IO.write(file, &1))
end
end
1 change: 1 addition & 0 deletions location.iml
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,6 @@
<orderEntry type="library" name="unzip" level="project" />
<orderEntry type="library" name="benchee_html" level="project" />
<orderEntry type="library" name="mox" level="project" />
<orderEntry type="library" name="csv" level="project" />
</component>
</module>
3 changes: 2 additions & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ defmodule Location.MixProject do
{:tesla, "~> 1.8"},
{:hackney, "~> 1.20"},
{:flow, "~> 1.2"},
{:unzip, "0.11.0"}
{:unzip, "0.11.0"},
{:csv, "~> 3.2"}
]
end
end
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
%{
"certifi": {:hex, :certifi, "2.14.0", "ed3bef654e69cde5e6c022df8070a579a79e8ba2368a00acf3d75b82d9aceeed", [:rebar3], [], "hexpm", "ea59d87ef89da429b8e905264fdec3419f84f2215bb3d81e07a18aac919026c3"},
"csv": {:hex, :csv, "3.2.2", "452f96414b39a176b7c390af6d8b78f15130dc6167fe3b836729131f515d843e", [:mix], [], "hexpm", "cbf256ff74a3fa01d9ec420d07b19c90d410ed9fe5b6d6e1bc7662edf35bc574"},
"floki": {:hex, :floki, "0.35.4", "cc947b446024732c07274ac656600c5c4dc014caa1f8fb2dfff93d275b83890d", [:mix], [], "hexpm", "27fa185d3469bd8fc5947ef0f8d5c4e47f0af02eb6b070b63c868f69e3af0204"},
"flow": {:hex, :flow, "1.2.4", "1dd58918287eb286656008777cb32714b5123d3855956f29aa141ebae456922d", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "874adde96368e71870f3510b91e35bc31652291858c86c0e75359cbdd35eb211"},
"gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"},
Expand Down
Empty file removed priv/postal_codes.csv
Empty file.

0 comments on commit fd0506c

Please sign in to comment.