diff --git a/Project.toml b/Project.toml index 9f06c45..cce3b32 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Cbc = "9961bab8-2fa3-5c5a-9d89-47fab24efd76" Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" Geodesy = "0ef565a4-170c-5f04-8de2-149903a85f3d" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" @@ -17,6 +18,7 @@ JSONSchema = "7d188eb4-7ad8-530c-ae41-71a32a6d4692" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" diff --git a/src/instance/geodb.jl b/src/instance/geodb.jl index 118c7d0..2f8f2e9 100644 --- a/src/instance/geodb.jl +++ b/src/instance/geodb.jl @@ -9,6 +9,10 @@ using Shapefile using Statistics using ZipFile using ProgressBars +using OrderedCollections + +import Downloads: download +import Base: parse crc32 = crc(CRC_32) @@ -38,15 +42,28 @@ function centroid(geom::Shapefile.Polygon)::GeoPoint return GeoPoint(round(y_center, digits = 5), round(x_center, digits = 5)) end -function download_zip(url, outputdir, shp_crc32)::Nothing +function _download(url, output, expected_crc32)::Nothing + if isfile(output) + return + end + mkpath(dirname(output)) + @info "Downloading: $url" + fname = download(url) + actual_crc32 = open(crc32, fname) + expected_crc32 == actual_crc32 || error("CRC32 mismatch") + cp(fname, output) + return +end + +function _download_zip(url, outputdir, expected_crc32)::Nothing if isdir(outputdir) return end mkpath(outputdir) @info "Downloading: $url" - zip_filename = download(url) + zip_filename = _download(url) actual_crc32 = open(crc32, zip_filename) - shp_crc32 == actual_crc32 || error("CRC32 mismatch") + expected_crc32 == actual_crc32 || error("CRC32 mismatch") open(zip_filename) do zip_file zr = ZipFile.Reader(zip_file) for file in zr.files @@ -58,33 +75,59 @@ function download_zip(url, outputdir, shp_crc32)::Nothing return end -function geodb_load_gov_census(; +function _geodb_load_gov_census(; db_name, - extract_id, + extract_cols, shp_crc32, shp_filename, shp_url, + population_url, + population_crc32, + population_col = "POPESTIMATE2019", + population_join_key = "STATE", )::Dict{String,GeoRegion} basedir = joinpath(dirname(@__FILE__), "..", "..", "data", db_name) csv_filename = "$basedir/locations.csv" if !isfile(csv_filename) - download_zip(shp_url, basedir, shp_crc32) + # Download required files + _download(population_url, "$basedir/population.csv", population_crc32) + _download_zip(shp_url, basedir, shp_crc32) + + # Read shapefile @info "Processing: $shp_filename" table = Shapefile.Table(joinpath(basedir, shp_filename)) geoms = Shapefile.shapes(table) - df = DataFrame(id = String[], latitude = Float64[], longitude = Float64[]) + + # Build empty dataframe + df = DataFrame() + cols = extract_cols(table, 1) + for k in keys(cols) + df[!, k] = [] + end + df[!, "latitude"] = Float64[] + df[!, "longitude"] = Float64[] + + # Add regions to dataframe for (i, geom) in tqdm(enumerate(geoms)) c = centroid(geom) - id = extract_id(table, i) - push!(df, [id, c.lat, c.lon]) + cols = extract_cols(table, i) + push!(df, [values(cols)..., c.lat, c.lon]) end sort!(df) + + # Join with population data + population = DataFrame(CSV.File("$basedir/population.csv")) + population = population[:, [population_join_key, population_col]] + rename!(population, population_col => "population") + df = leftjoin(df, population, on = population_join_key) + + # Write output CSV.write(csv_filename, df) end if db_name ∉ keys(DB_CACHE) - csv = CSV.File(csv_filename; types = [String, Float64, Float64]) + csv = CSV.File(csv_filename) DB_CACHE[db_name] = Dict( - row.id => GeoRegion( + string(row.id) => GeoRegion( centroid = GeoPoint(row.latitude, row.longitude), population = 0, ) for row in csv @@ -93,52 +136,62 @@ function geodb_load_gov_census(; return DB_CACHE[db_name] end -function _id_2018_us_county(table::Shapefile.Table, i::Int)::String - return table.STATEFP[i] * table.COUNTYFP[i] +function _cols_2018_us_county(table::Shapefile.Table, i::Int)::OrderedDict{String,Any} + return OrderedDict("id" => table.STATEFP[i] * table.COUNTYFP[i]) end -function geodb_load_2018_us_county()::Dict{String,GeoRegion} - return geodb_load_gov_census( +function _geodb_load_2018_us_county()::Dict{String,GeoRegion} + return _geodb_load_gov_census( db_name = "2018-us-county", - extract_id = _id_2018_us_county, + extract_cols = _cols_2018_us_county, shp_crc32 = 0x83eaec6d, shp_filename = "cb_2018_us_county_500k.shp", shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip", + population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv", + population_crc32 = 0x191cc64c, ) end -function _id_2018_us_zcta(table::Shapefile.Table, i::Int)::String - return table.ZCTA5CE10[i] +function _cols_2018_us_zcta(table::Shapefile.Table, i::Int)::OrderedDict{String,Any} + return OrderedDict("id" => table.ZCTA5CE10[i]) end -function geodb_load_2018_us_zcta()::Dict{String,GeoRegion} - return geodb_load_gov_census( +function _geodb_load_2018_us_zcta()::Dict{String,GeoRegion} + return _geodb_load_gov_census( db_name = "2018-us-zcta", - extract_id = _id_2018_us_zcta, + extract_cols = _cols_2018_us_zcta, shp_crc32 = 0x6391f5fc, shp_filename = "cb_2018_us_zcta510_500k.shp", shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_zcta510_500k.zip", + population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv", + population_crc32 = 0x191cc64c, ) end -function _id_us_state(table::Shapefile.Table, i::Int)::String - return table.STUSPS[i] +function _cols_us_state(table::Shapefile.Table, i::Int)::OrderedDict{String,Any} + return OrderedDict( + "id" => table.STUSPS[i], + "STATE" => parse(Int, table.STATEFP[i]), + "name" => table.NAME[i], + ) end -function geodb_load_us_state()::Dict{String,GeoRegion} - return geodb_load_gov_census( +function _geodb_load_us_state()::Dict{String,GeoRegion} + return _geodb_load_gov_census( db_name = "us-state", - extract_id = _id_us_state, + extract_cols = _cols_us_state, shp_crc32 = 0x9469e5ca, shp_filename = "cb_2018_us_state_500k.shp", shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip", + population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv", + population_crc32 = 0x191cc64c, ) end function geodb_load(db_name::AbstractString)::Dict{String,GeoRegion} - db_name == "2018-us-county" && return geodb_load_2018_us_county() - db_name == "2018-us-zcta" && return geodb_load_2018_us_zcta() - db_name == "us-state" && return geodb_load_us_state() + db_name == "2018-us-county" && return _geodb_load_2018_us_county() + db_name == "2018-us-zcta" && return _geodb_load_2018_us_zcta() + db_name == "us-state" && return _geodb_load_us_state() error("Unknown database: $db_name") end