Files
RELOG/src/instance/geodb.jl
2021-07-16 10:31:21 -05:00

213 lines
6.5 KiB
Julia

# RELOG: Reverse Logistics Optimization
# Copyright (C) 2020, UChicago Argonne, LLC. All rights reserved.
# Released under the modified BSD license. See COPYING.md for more details.
using CRC
using CSV
using DataFrames
using Shapefile
using Statistics
using ZipFile
using ProgressBars
using OrderedCollections
import Downloads: download
import Base: parse
crc32 = crc(CRC_32)
struct GeoPoint
lat::Float64
lon::Float64
end
struct GeoRegion
centroid::GeoPoint
population::Int
GeoRegion(; centroid, population) = new(centroid, population)
end
DB_CACHE = Dict{String,Dict{String,GeoRegion}}()
function centroid(geom::Shapefile.Polygon)::GeoPoint
x_max, x_min, y_max, y_min = -Inf, Inf, -Inf, Inf
for p in geom.points
x_max = max(x_max, p.x)
x_min = min(x_min, p.x)
y_max = max(y_max, p.y)
y_min = min(y_min, p.y)
end
x_center = (x_max + x_min) / 2.0
y_center = (y_max + y_min) / 2.0
return GeoPoint(round(y_center, digits = 5), round(x_center, digits = 5))
end
function _download_file(url, output, expected_crc32)::Nothing
if isfile(output)
return
end
mkpath(dirname(output))
@info "Downloading: $url"
fname = download(url)
actual_crc32 = open(crc32, fname)
expected_crc32 == actual_crc32 || error("CRC32 mismatch")
cp(fname, output)
return
end
function _download_zip(url, outputdir, expected_output_file, expected_crc32)::Nothing
if isfile(expected_output_file)
return
end
mkpath(outputdir)
@info "Downloading: $url"
zip_filename = download(url)
actual_crc32 = open(crc32, zip_filename)
expected_crc32 == actual_crc32 || error("CRC32 mismatch")
open(zip_filename) do zip_file
zr = ZipFile.Reader(zip_file)
for file in zr.files
open(joinpath(outputdir, file.name), "w") do output_file
write(output_file, read(file))
end
end
end
return
end
function _geodb_load_gov_census(;
db_name,
extract_cols,
shp_crc32,
shp_filename,
shp_url,
population_url,
population_crc32,
population_col,
population_preprocess,
population_join,
)::Dict{String,GeoRegion}
basedir = joinpath(dirname(@__FILE__), "..", "..", "data", db_name)
csv_filename = "$basedir/locations.csv"
if !isfile(csv_filename)
# Download required files
_download_zip(shp_url, basedir, joinpath(basedir, shp_filename), shp_crc32)
_download_file(population_url, "$basedir/population.csv", population_crc32)
# Read shapefile
@info "Processing: $shp_filename"
table = Shapefile.Table(joinpath(basedir, shp_filename))
geoms = Shapefile.shapes(table)
# Build empty dataframe
df = DataFrame()
cols = extract_cols(table, 1)
for k in keys(cols)
df[!, k] = []
end
df[!, "latitude"] = Float64[]
df[!, "longitude"] = Float64[]
# Add regions to dataframe
for (i, geom) in tqdm(enumerate(geoms))
c = centroid(geom)
cols = extract_cols(table, i)
push!(df, [values(cols)..., c.lat, c.lon])
end
sort!(df)
# Join with population data
population = DataFrame(CSV.File("$basedir/population.csv"))
population_preprocess(population)
population = population[:, [population_join, population_col]]
rename!(population, population_col => "population")
df = leftjoin(df, population, on = population_join)
# Write output
CSV.write(csv_filename, df)
end
if db_name keys(DB_CACHE)
csv = CSV.File(csv_filename)
DB_CACHE[db_name] = Dict(
string(row.id) => GeoRegion(
centroid = GeoPoint(row.latitude, row.longitude),
population = (row.population === missing ? 0 : row.population),
) for row in csv
)
end
return DB_CACHE[db_name]
end
# 2018 US counties
# -----------------------------------------------------------------------------
function _extract_cols_2018_us_county(
table::Shapefile.Table,
i::Int,
)::OrderedDict{String,Any}
return OrderedDict(
"id" => table.STATEFP[i] * table.COUNTYFP[i],
"statefp" => table.STATEFP[i],
"countyfp" => table.COUNTYFP[i],
"name" => table.NAME[i],
)
end
function _population_preprocess_2018_us_county(df)
df[!, "id"] = [@sprintf("%02d%03d", row.STATE, row.COUNTY) for row in eachrow(df)]
end
function _geodb_load_2018_us_county()::Dict{String,GeoRegion}
return _geodb_load_gov_census(
db_name = "2018-us-county",
extract_cols = _extract_cols_2018_us_county,
shp_crc32 = 0x83eaec6d,
shp_filename = "cb_2018_us_county_500k.shp",
shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip",
population_url = "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv",
population_crc32 = 0xf85b0405,
population_col = "POPESTIMATE2019",
population_join = "id",
population_preprocess = _population_preprocess_2018_us_county,
)
end
# US States
# -----------------------------------------------------------------------------
function _extract_cols_us_state(table::Shapefile.Table, i::Int)::OrderedDict{String,Any}
return OrderedDict(
"id" => table.STUSPS[i],
"statefp" => parse(Int, table.STATEFP[i]),
"name" => table.NAME[i],
)
end
function _population_preprocess_us_state(df)
rename!(df, "STATE" => "statefp")
end
function _geodb_load_us_state()::Dict{String,GeoRegion}
return _geodb_load_gov_census(
db_name = "us-state",
extract_cols = _extract_cols_us_state,
shp_crc32 = 0x9469e5ca,
shp_filename = "cb_2018_us_state_500k.shp",
shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip",
population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv",
population_crc32 = 0x191cc64c,
population_col = "POPESTIMATE2019",
population_join = "statefp",
population_preprocess = _population_preprocess_us_state,
)
end
function geodb_load(db_name::AbstractString)::Dict{String,GeoRegion}
db_name == "2018-us-county" && return _geodb_load_2018_us_county()
db_name == "us-state" && return _geodb_load_us_state()
error("Unknown database: $db_name")
end
function geodb_query(name)::GeoRegion
db_name, id = split(name, ":")
return geodb_load(db_name)[id]
end