Download and join population

feature/geodb
Alinson S. Xavier 4 years ago
parent 33ab4c5f76
commit e407a53ecf

@ -10,6 +10,7 @@ Cbc = "9961bab8-2fa3-5c5a-9d89-47fab24efd76"
Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d" Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
Geodesy = "0ef565a4-170c-5f04-8de2-149903a85f3d" Geodesy = "0ef565a4-170c-5f04-8de2-149903a85f3d"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
@ -17,6 +18,7 @@ JSONSchema = "7d188eb4-7ad8-530c-ae41-71a32a6d4692"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572" JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d" PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"

@ -9,6 +9,10 @@ using Shapefile
using Statistics using Statistics
using ZipFile using ZipFile
using ProgressBars using ProgressBars
using OrderedCollections
import Downloads: download
import Base: parse
crc32 = crc(CRC_32) crc32 = crc(CRC_32)
@ -38,15 +42,28 @@ function centroid(geom::Shapefile.Polygon)::GeoPoint
return GeoPoint(round(y_center, digits = 5), round(x_center, digits = 5)) return GeoPoint(round(y_center, digits = 5), round(x_center, digits = 5))
end end
function download_zip(url, outputdir, shp_crc32)::Nothing function _download(url, output, expected_crc32)::Nothing
if isfile(output)
return
end
mkpath(dirname(output))
@info "Downloading: $url"
fname = download(url)
actual_crc32 = open(crc32, fname)
expected_crc32 == actual_crc32 || error("CRC32 mismatch")
cp(fname, output)
return
end
function _download_zip(url, outputdir, expected_crc32)::Nothing
if isdir(outputdir) if isdir(outputdir)
return return
end end
mkpath(outputdir) mkpath(outputdir)
@info "Downloading: $url" @info "Downloading: $url"
zip_filename = download(url) zip_filename = _download(url)
actual_crc32 = open(crc32, zip_filename) actual_crc32 = open(crc32, zip_filename)
shp_crc32 == actual_crc32 || error("CRC32 mismatch") expected_crc32 == actual_crc32 || error("CRC32 mismatch")
open(zip_filename) do zip_file open(zip_filename) do zip_file
zr = ZipFile.Reader(zip_file) zr = ZipFile.Reader(zip_file)
for file in zr.files for file in zr.files
@ -58,33 +75,59 @@ function download_zip(url, outputdir, shp_crc32)::Nothing
return return
end end
function geodb_load_gov_census(; function _geodb_load_gov_census(;
db_name, db_name,
extract_id, extract_cols,
shp_crc32, shp_crc32,
shp_filename, shp_filename,
shp_url, shp_url,
population_url,
population_crc32,
population_col = "POPESTIMATE2019",
population_join_key = "STATE",
)::Dict{String,GeoRegion} )::Dict{String,GeoRegion}
basedir = joinpath(dirname(@__FILE__), "..", "..", "data", db_name) basedir = joinpath(dirname(@__FILE__), "..", "..", "data", db_name)
csv_filename = "$basedir/locations.csv" csv_filename = "$basedir/locations.csv"
if !isfile(csv_filename) if !isfile(csv_filename)
download_zip(shp_url, basedir, shp_crc32) # Download required files
_download(population_url, "$basedir/population.csv", population_crc32)
_download_zip(shp_url, basedir, shp_crc32)
# Read shapefile
@info "Processing: $shp_filename" @info "Processing: $shp_filename"
table = Shapefile.Table(joinpath(basedir, shp_filename)) table = Shapefile.Table(joinpath(basedir, shp_filename))
geoms = Shapefile.shapes(table) geoms = Shapefile.shapes(table)
df = DataFrame(id = String[], latitude = Float64[], longitude = Float64[])
# Build empty dataframe
df = DataFrame()
cols = extract_cols(table, 1)
for k in keys(cols)
df[!, k] = []
end
df[!, "latitude"] = Float64[]
df[!, "longitude"] = Float64[]
# Add regions to dataframe
for (i, geom) in tqdm(enumerate(geoms)) for (i, geom) in tqdm(enumerate(geoms))
c = centroid(geom) c = centroid(geom)
id = extract_id(table, i) cols = extract_cols(table, i)
push!(df, [id, c.lat, c.lon]) push!(df, [values(cols)..., c.lat, c.lon])
end end
sort!(df) sort!(df)
# Join with population data
population = DataFrame(CSV.File("$basedir/population.csv"))
population = population[:, [population_join_key, population_col]]
rename!(population, population_col => "population")
df = leftjoin(df, population, on = population_join_key)
# Write output
CSV.write(csv_filename, df) CSV.write(csv_filename, df)
end end
if db_name keys(DB_CACHE) if db_name keys(DB_CACHE)
csv = CSV.File(csv_filename; types = [String, Float64, Float64]) csv = CSV.File(csv_filename)
DB_CACHE[db_name] = Dict( DB_CACHE[db_name] = Dict(
row.id => GeoRegion( string(row.id) => GeoRegion(
centroid = GeoPoint(row.latitude, row.longitude), centroid = GeoPoint(row.latitude, row.longitude),
population = 0, population = 0,
) for row in csv ) for row in csv
@ -93,52 +136,62 @@ function geodb_load_gov_census(;
return DB_CACHE[db_name] return DB_CACHE[db_name]
end end
function _id_2018_us_county(table::Shapefile.Table, i::Int)::String function _cols_2018_us_county(table::Shapefile.Table, i::Int)::OrderedDict{String,Any}
return table.STATEFP[i] * table.COUNTYFP[i] return OrderedDict("id" => table.STATEFP[i] * table.COUNTYFP[i])
end end
function geodb_load_2018_us_county()::Dict{String,GeoRegion} function _geodb_load_2018_us_county()::Dict{String,GeoRegion}
return geodb_load_gov_census( return _geodb_load_gov_census(
db_name = "2018-us-county", db_name = "2018-us-county",
extract_id = _id_2018_us_county, extract_cols = _cols_2018_us_county,
shp_crc32 = 0x83eaec6d, shp_crc32 = 0x83eaec6d,
shp_filename = "cb_2018_us_county_500k.shp", shp_filename = "cb_2018_us_county_500k.shp",
shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip", shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip",
population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv",
population_crc32 = 0x191cc64c,
) )
end end
function _id_2018_us_zcta(table::Shapefile.Table, i::Int)::String function _cols_2018_us_zcta(table::Shapefile.Table, i::Int)::OrderedDict{String,Any}
return table.ZCTA5CE10[i] return OrderedDict("id" => table.ZCTA5CE10[i])
end end
function geodb_load_2018_us_zcta()::Dict{String,GeoRegion} function _geodb_load_2018_us_zcta()::Dict{String,GeoRegion}
return geodb_load_gov_census( return _geodb_load_gov_census(
db_name = "2018-us-zcta", db_name = "2018-us-zcta",
extract_id = _id_2018_us_zcta, extract_cols = _cols_2018_us_zcta,
shp_crc32 = 0x6391f5fc, shp_crc32 = 0x6391f5fc,
shp_filename = "cb_2018_us_zcta510_500k.shp", shp_filename = "cb_2018_us_zcta510_500k.shp",
shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_zcta510_500k.zip", shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_zcta510_500k.zip",
population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv",
population_crc32 = 0x191cc64c,
) )
end end
function _id_us_state(table::Shapefile.Table, i::Int)::String function _cols_us_state(table::Shapefile.Table, i::Int)::OrderedDict{String,Any}
return table.STUSPS[i] return OrderedDict(
"id" => table.STUSPS[i],
"STATE" => parse(Int, table.STATEFP[i]),
"name" => table.NAME[i],
)
end end
function geodb_load_us_state()::Dict{String,GeoRegion} function _geodb_load_us_state()::Dict{String,GeoRegion}
return geodb_load_gov_census( return _geodb_load_gov_census(
db_name = "us-state", db_name = "us-state",
extract_id = _id_us_state, extract_cols = _cols_us_state,
shp_crc32 = 0x9469e5ca, shp_crc32 = 0x9469e5ca,
shp_filename = "cb_2018_us_state_500k.shp", shp_filename = "cb_2018_us_state_500k.shp",
shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip", shp_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip",
population_url = "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv",
population_crc32 = 0x191cc64c,
) )
end end
function geodb_load(db_name::AbstractString)::Dict{String,GeoRegion} function geodb_load(db_name::AbstractString)::Dict{String,GeoRegion}
db_name == "2018-us-county" && return geodb_load_2018_us_county() db_name == "2018-us-county" && return _geodb_load_2018_us_county()
db_name == "2018-us-zcta" && return geodb_load_2018_us_zcta() db_name == "2018-us-zcta" && return _geodb_load_2018_us_zcta()
db_name == "us-state" && return geodb_load_us_state() db_name == "us-state" && return _geodb_load_us_state()
error("Unknown database: $db_name") error("Unknown database: $db_name")
end end

Loading…
Cancel
Save