Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,21 @@ if [[ $# -lt 2 ]]; then
exit 1
fi

# Scan every argument for the --test switch; TEST stays 0 unless it is present.
TEST=0
for arg in "$@"; do
  if [[ "$arg" == "--test" ]]; then
    TEST=1
  fi
done

echo "Extracting feature layers from $1"
./process.sh "$@"

for input_file in "$2"/*.parquet; do
echo "Sorting and compressing $input_file"
output_file="$2/$(basename -s .parquet $input_file)-optimized.parquet"
./postprocess.sh $input_file $output_file
mv $output_file $input_file
done
# Sort and compress every .parquet file in the output directory ($2) in place.
# The whole step is skipped unless TEST is 0.
if [ "$TEST" = "0" ]; then
  for input_file in "$2"/*.parquet; do
    # An unmatched glob is left as the literal pattern; skip it rather than
    # handing a non-existent path to postprocess.sh.
    [ -e "$input_file" ] || continue
    echo "Sorting and compressing $input_file"
    # Quote every expansion so paths containing spaces or glob characters
    # survive word splitting.
    output_file="$2/$(basename -s .parquet -- "$input_file")-optimized.parquet"
    ./postprocess.sh "$input_file" "$output_file"
    # Replace the original file with its optimized counterpart.
    mv -- "$output_file" "$input_file"
  done
fi
41 changes: 29 additions & 12 deletions process.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# using DuckDB with the osmium and spatial extensions.
#
# Usage: process.sh <input.osm.pbf> <output_dir> [--buildings] [--highways] ...
# [--osmium-index-type=TYPE] [--duckdb-memory-limit=LIMIT]
# [--test] [--osmium-index-type=TYPE] [--duckdb-memory-limit=LIMIT]
#
# --osmium-index-type sets osmium's node location index type. The default
# is 'flex_mem' which works well for both small and large extracts. For
Expand All @@ -27,8 +27,9 @@ OUTPUT_DIR="$2"
shift 2

# Parse optional layer flags
BUILDINGS=0; HIGHWAYS=0; BOUNDARIES=0; SETTLEMENTS=0; PARKS=0
BUILDINGS=0; HIGHWAYS=0; BOUNDARIES=0; SETTLEMENTS=0; PARKS=0; WATER=0
ALL=1
TEST=0
OSMIUM_INDEX_TYPE=""
DUCKDB_MEMORY_LIMIT=""

Expand All @@ -39,14 +40,16 @@ for arg in "$@"; do
--boundaries) BOUNDARIES=1; ALL=0 ;;
--settlements) SETTLEMENTS=1; ALL=0 ;;
--parks) PARKS=1; ALL=0 ;;
--water) WATER=1; ALL=0 ;;
--test) TEST=1 ;;
--osmium-index-type=*) OSMIUM_INDEX_TYPE="${arg#*=}" ;;
--duckdb-memory-limit=*) DUCKDB_MEMORY_LIMIT="${arg#*=}" ;;
*) echo "Unknown argument: $arg" >&2; exit 1 ;;
esac
done

if [ "$ALL" = "1" ]; then
BUILDINGS=1; HIGHWAYS=1; BOUNDARIES=1; SETTLEMENTS=1; PARKS=1
BUILDINGS=1; HIGHWAYS=1; BOUNDARIES=1; SETTLEMENTS=1; PARKS=1; WATER=1
fi

mkdir -p "$OUTPUT_DIR"
Expand All @@ -56,21 +59,35 @@ mkdir -p "$OUTPUT_DIR"
#######################################
# Run the DuckDB SQL script for a single layer.
#
# Normal mode executes sql/<name>.sql; when TEST is 1 it executes
# sql/<name>_test.sql instead and quietly skips layers that have none.
# sql/macros.sql and any SET statements are prepended, the {{INPUT}} and
# {{OUTPUT}} placeholders are substituted, and the result is piped to duckdb.
#
# Globals (read): SCRIPT_DIR, OUTPUT_DIR, INPUT, TEST,
#                 DUCKDB_MEMORY_LIMIT, OSMIUM_INDEX_TYPE
# Arguments:      $1 - layer name (basename of the SQL script)
# Outputs:        progress message on stdout, warning on stderr
# Returns:        status of the sed | duckdb pipeline; 0 when the script file
#                 does not exist and the layer is skipped
#######################################
run_layer() {
  local name="$1"
  local output="${OUTPUT_DIR}/${name}.parquet"
  local script_file input_repl output_repl

  if [ "$TEST" = "1" ]; then
    script_file="${SCRIPT_DIR}/sql/${name}_test.sql"
    # Layers without a test script are skipped without any output.
    [ -f "$script_file" ] || return 0
    echo "Testing ${name} layer"
  else
    echo "Extracting ${name} layer"
    script_file="${SCRIPT_DIR}/sql/${name}.sql"
    if [ ! -f "$script_file" ]; then
      # Still a skip (status 0), but say so instead of pretending to extract.
      echo "Warning: no SQL script for ${name} layer: ${script_file}" >&2
      return 0
    fi
  fi

  # Escape the characters that are special on the replacement side of the
  # s|...|...| commands below (backslash, '&' and the '|' delimiter) so that
  # paths containing them are inserted literally.
  input_repl=$(printf '%s' "$INPUT" | sed 's/[\\&|]/\\&/g')
  output_repl=$(printf '%s' "$output" | sed 's/[\\&|]/\\&/g')

  {
    cat "${SCRIPT_DIR}/sql/macros.sql"
    if [ -n "$DUCKDB_MEMORY_LIMIT" ]; then
      echo "SET memory_limit = '${DUCKDB_MEMORY_LIMIT}';"
    fi
    if [ -n "$OSMIUM_INDEX_TYPE" ]; then
      echo "SET osmium_index_type = '${OSMIUM_INDEX_TYPE}';"
    fi
    cat "$script_file"
  } \
    | sed "s|{{INPUT}}|${input_repl}|g; s|{{OUTPUT}}|${output_repl}|g" \
    | duckdb --unsigned
}

# Run each selected layer, always in this fixed order.
if [ "$BUILDINGS" = "1" ]; then
  run_layer buildings
fi
if [ "$HIGHWAYS" = "1" ]; then
  run_layer highways
fi
if [ "$BOUNDARIES" = "1" ]; then
  run_layer boundaries
fi
if [ "$SETTLEMENTS" = "1" ]; then
  run_layer settlements
fi
if [ "$PARKS" = "1" ]; then
  run_layer parks
fi
if [ "$WATER" = "1" ]; then
  run_layer water
fi

echo "Done"
44 changes: 44 additions & 0 deletions sql/macros.sql
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,47 @@ CREATE OR REPLACE MACRO prefix_map_split(pfx, t) AS (
)
)
);

-- assert_col_not_empty(table_name, col_name)
-- Assertion helper: evaluates to 1 when at least one row of table_name has a
-- non-NULL value in col_name; otherwise calls error() with the message
-- 'Assertion Failed: Empty column: <col_name>'.
-- Both arguments are concatenated into the query text as-is (col_name inside
-- double quotes); no escaping is applied to either of them.
CREATE OR REPLACE MACRO assert_col_not_empty(table_name, col_name) AS (
SELECT CASE
WHEN NOT EXISTS (
-- Must rewrap the col_name in an extra layer of quoting.
SELECT 1 FROM query('SELECT 1 FROM ' || table_name || ' WHERE "' || col_name || '" IS NOT NULL LIMIT 1')
)
-- The CAST only gives this branch the same INTEGER type as the ELSE branch.
THEN CAST(error('Assertion Failed: Empty column: ' || col_name) AS INTEGER)
ELSE 1
END
);

-- assert_map_not_empty(table_name, col_name)
-- Assertion helper: evaluates to 1 when at least one row of table_name has
-- cardinality(col_name) > 0; otherwise calls error() with the message
-- 'Assertion Failed: Empty map found for column: <col_name>'.
-- Note that the assertion only fails when no row at all has a non-empty value;
-- individual empty maps are tolerated.
-- Both arguments are concatenated into the query text as-is (col_name inside
-- double quotes); no escaping is applied to either of them.
CREATE OR REPLACE MACRO assert_map_not_empty(table_name, col_name) AS (
SELECT CASE
WHEN NOT EXISTS (
-- Must rewrap the col_name in an extra layer of quoting.
SELECT 1 FROM query('SELECT 1 FROM ' || table_name || ' WHERE cardinality("' || col_name || '") > 0 LIMIT 1')
)
-- The CAST only gives this branch the same INTEGER type as the ELSE branch.
THEN CAST(error('Assertion Failed: Empty map found for column: ' || col_name) AS INTEGER)
ELSE 1
END
);

-- assert_tag_not_empty(table_name, k, v)
-- Assertion helper: evaluates to 1 when at least one row of table_name has
-- column k equal to the string v; otherwise calls error() with the message
-- 'Assertion Failed: Empty tag for "<k>"='<v>''.
-- The query text is assembled with printf: k is placed inside double quotes
-- and v inside single quotes, without any escaping of either value.
CREATE OR REPLACE MACRO assert_tag_not_empty(table_name, k, v) AS (
SELECT CASE
WHEN NOT EXISTS (
SELECT 1 FROM query(
printf('SELECT 1 FROM %s WHERE "%s" = ''%s'' LIMIT 1',table_name, k, v)
)
)
-- The CAST only gives this branch the same INTEGER type as the ELSE branch.
THEN CAST(error(printf('Assertion Failed: Empty tag for "%s"=''%s''', k, v)) AS INTEGER)
ELSE 1
END
);

-- assert_stmt_not_empty(stmt)
-- Assertion helper: runs the SQL text in stmt through query() and evaluates
-- to 1 when it returns at least one row; otherwise calls error() with the
-- message 'Assertion Failed: stmt '<stmt>' produced no values'.
CREATE OR REPLACE MACRO assert_stmt_not_empty(stmt) AS (
SELECT CASE
WHEN NOT EXISTS (
SELECT 1 FROM query(stmt)
)
-- The CAST only gives this branch the same INTEGER type as the ELSE branch.
THEN CAST(error(printf('Assertion Failed: stmt ''%s'' produced no values', stmt)) AS INTEGER)
ELSE 1
END
);
151 changes: 151 additions & 0 deletions sql/water.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
-- Candidate bridges: every feature carrying any bridge=* tag or tagged
-- man_made=bridge, regardless of what it crosses.
CREATE OR REPLACE TEMP TABLE bridges_unfiltered AS
    SELECT
        type,
        id,
        tags,
        geometry
    FROM '{{INPUT}}'
    WHERE tags['bridge'] IS NOT NULL
       OR tags['man_made'] = 'bridge';

-- Every feature considered water-related: waterways, water areas, and a broad
-- set of water-adjacent infrastructure, navigation, recreation and safety tags.
-- A feature qualifies as soon as any one of the OR-ed conditions matches.
CREATE OR REPLACE TEMP TABLE water_features AS
SELECT type, id, tags, geometry
FROM '{{INPUT}}'
WHERE (
    -- Linear waterways and water-covered areas.
    (kind = 'line' AND tags['waterway'] IS NOT NULL) OR
    (kind = 'area' AND (
        tags['natural'] IN ('water', 'coastline', 'wetland') OR
        tags['landuse'] IN ('basin', 'reservoir', 'harbour') OR
        tags['waterway'] IS NOT NULL
    )) OR
    -- Man-made structures on or next to water.
    -- NOTE(review): springs are usually tagged natural=spring in OSM; confirm
    -- that matching 'spring' under man_made is intended.
    tags['man_made'] IN ('pier', 'breakwater', 'groyne', 'lighthouse', 'beacon', 'buoy', 'offshore_platform', 'pumping_station', 'water_well', 'spring') OR
    (
        tags['man_made'] = 'monitoring_station' AND (
            tags['monitoring:water'] IS NOT NULL OR
            tags['monitoring:water_level'] IS NOT NULL OR
            tags['monitoring:water_quality'] IS NOT NULL
        )
    ) OR
    -- 'aqueduct' is the spelling of the OSM value (previously misspelled
    -- 'aquaduct', which meant correctly tagged aqueducts were never matched).
    tags['historic'] IN ('wreck', 'ship', 'aqueduct') OR
    tags['seamark:type'] IS NOT NULL OR
    tags['route'] IN ('ferry', 'portage') OR
    -- Recreation, amenities and sports.
    tags['leisure'] IN ('slipway', 'marina', 'swimming_pool', 'swimming_area', 'water_park') OR
    tags['amenity'] IN ('drinking_water', 'foot_shower', 'shower', 'boat_rental') OR
    tags['sport'] IN ('canoe', 'cliff_diving', 'diving', 'dragon_boat', 'rowing', 'sailing', 'scuba_diving', 'surfing', 'swimming', 'wakeboarding', 'water_ski', 'windsurfing') OR
    tags['portage'] IS NOT NULL OR
    tags['canoe'] IS NOT NULL OR
    tags['canoe_rental'] IS NOT NULL OR
    tags['mooring'] IS NOT NULL OR
    (tags['landuse'] = 'industrial' AND tags['industrial'] = 'port') OR
    -- Water rescue and emergency equipment.
    ( -- All deprecated in favor of tags['emergency'] = 'water_rescue'
        tags['emergency'] IN ('lifeboat_station', 'marine_rescue') OR
        tags['amenity'] = 'lifeboat_station'
    ) OR
    tags['emergency'] IN ('lifeguard', 'water_rescue', 'life_ring', 'throw_bag', 'rescue_buoy') OR
    (
        tags['emergency'] = 'assembly_point' AND (
            -- Unfortunately not all tsunami assembly points have the correct assembly_point:tsunami tag
            -- https://www.openstreetmap.org/node/4368193931
            tags['assembly_point:tsunami'] IS NOT NULL OR
            tags['assembly_point:storm_surge'] IS NOT NULL
        )
    ) OR
    -- Water-related attributes on otherwise unrelated features.
    tags['ford'] IS NOT NULL OR
    tags['tidal'] IS NOT NULL OR
    tags['flood_prone'] IS NOT NULL OR
    tags['whitewater'] IS NOT NULL OR
    tags['club'] IN ('sailing', 'scuba_diving', 'surf_life_saving') OR
    tags['shop'] = 'boat' OR
    tags['boat:type'] IS NOT NULL
);

-- Bridges over water: the rows of bridges_unfiltered whose geometry intersects
-- the geometry of at least one row of water_features.
-- The join emits a bridge once per matching water feature, so the table can
-- contain the same bridge several times.
-- NOTE(review): `&&` is presumably a cheap bounding-box overlap pre-filter
-- ahead of the exact ST_Intersects test; confirm against the DuckDB spatial
-- extension documentation.
-- Known to exclude https://www.openstreetmap.org/way/35457618 from the Oregon region
CREATE OR REPLACE TEMP TABLE water_bridges AS
SELECT b.type, b.id, b.tags, b.geometry
FROM bridges_unfiltered b
JOIN water_features w
ON b.geometry && w.geometry
WHERE ST_Intersects(b.geometry, w.geometry);

-- Write the water layer: de-duplicated water bridges plus all water features,
-- with selected tags flattened into columns, prefixed tag families collected
-- via the prefix_map / prefix_map_split macros, and a bounding box per feature.
COPY (
    WITH raw AS (
        -- `SELECT DISTINCT` is applied here rather than while building the
        -- "water_bridges" table to work around a floating point exception
        -- which randomly goes away if you `PRAGMA threads = 1`.
        SELECT DISTINCT type, id, tags, geometry FROM water_bridges

        UNION ALL
        SELECT type, id, tags, geometry FROM water_features
    )
    SELECT
        type,
        id,
        tags['natural'] AS "natural",
        tags['waterway'] AS waterway,
        tags['man_made'] AS man_made,
        tags['historic'] AS historic,
        tags['route'] AS route,
        tags['intermittent'] AS intermittent,
        tags['tunnel'] AS tunnel,
        tags['covered'] AS covered,
        tags['salt'] AS salt,
        tags['boat'] AS boat,
        tags['motorboat'] AS motorboat,
        tags['canoe'] AS canoe,
        tags['highway'] AS highway,
        tags['portage'] AS portage,
        tags['mooring'] AS mooring,
        tags['seasonal'] AS seasonal,
        tags['water'] AS water,
        tags['bridge'] AS bridge,
        tags['lifeguard'] AS lifeguard,
        tags['emergency'] AS emergency,
        tags['landuse'] AS landuse,
        tags['industrial'] AS industrial,
        tags['amenity'] AS amenity,
        tags['leisure'] AS leisure,
        tags['access'] AS access,
        tags['fee'] AS fee,
        tags['surface'] AS surface,
        tags['ford'] AS ford,
        tags['tidal'] AS tidal,
        tags['flood_prone'] AS flood_prone,
        tags['sport'] AS sport,
        tags['wheelchair'] AS wheelchair,
        tags['club'] AS club,
        tags['whitewater'] AS whitewater,
        tags['shop'] AS shop,
        tags['canoe_rental'] AS canoe_rental,
        -- tags['boat'] is already projected above; a second "boat" column here
        -- would duplicate the column name in the output schema.
        tags['ship'] AS ship,
        tags['pump'] AS pump, -- For man_made=water_well
        tags['drinking_water'] AS drinking_water, -- For man_made=water_well
        tags['handle'] AS handle, -- For man_made=water_well
        tags['mechanical_driver'] AS mechanical_driver, -- For man_made=water_well
        tags['depth'] AS depth, -- For man_made=water_well
        tags['mechanism'] AS mechanism, -- For man_made=water_well
        prefix_map('pump:', tags) AS "pump:", -- For man_made=water_well
        prefix_map('seamark:', tags) AS "seamark:",
        prefix_map('assembly_point:', tags) AS "assembly_point:",
        prefix_map('monitoring:', tags) AS "monitoring:",
        prefix_map('whitewater:', tags) AS "whitewater:",
        prefix_map('addr:', tags) AS "addr:",
        prefix_map_split('boat:', tags) AS "boat:",
        split_multi(tags['name']) AS name,
        prefix_map_split('name:', tags) AS names,
        split_multi(tags['official_name']) AS official_name,
        prefix_map_split('official_name:', tags) AS official_names,
        split_multi(tags['old_name']) AS old_name,
        prefix_map_split('old_name:', tags) AS old_names,
        split_multi(tags['alt_name']) AS alt_name,
        prefix_map_split('alt_name:', tags) AS alt_names,
        split_multi(tags['short_name']) AS short_name,
        prefix_map_split('short_name:', tags) AS short_names,
        tags['operator'] AS operator,
        tags['description'] AS description,
        tags['source'] AS source,
        tags['wikidata'] AS wikidata,
        tags['wikipedia'] AS wikipedia,
        -- Bounding box of the geometry as a struct of FLOAT values.
        {
            xmin: ST_XMin(geometry)::FLOAT,
            ymin: ST_YMin(geometry)::FLOAT,
            xmax: ST_XMax(geometry)::FLOAT,
            ymax: ST_YMax(geometry)::FLOAT
        } AS bbox,
        geometry
    FROM raw
) TO '{{OUTPUT}}' WITH (FORMAT PARQUET, COMPRESSION ZSTD);
Loading