In [ ]:
Copied!
import urban_mapper as um
from urban_mapper.pipeline import UrbanPipeline
import pandas as pd
import urban_mapper as um
from urban_mapper.pipeline import UrbanPipeline
import pandas as pd
In [ ]:
Copied!
import urban_mapper as um
from urban_mapper.pipeline import UrbanPipeline
# Fetch the "oscur/NYC_vehicle_collisions" dataset through the UrbanMapper
# loader, declaring which columns hold the coordinates.
data = (
    um.UrbanMapper()
    .loader
    .from_huggingface("oscur/NYC_vehicle_collisions")
    .with_columns(longitude_column="LONGITUDE", latitude_column="LATITUDE")
    .load()
)

# Force both coordinate columns to float before persisting them.
data["LONGITUDE"] = data["LONGITUDE"].astype(float)
data["LATITUDE"] = data["LATITUDE"].astype(float)

# index=False keeps the row index out of the file; otherwise it would be read
# back later as an extra unnamed column.
data.to_csv("./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv", index=False)
import urban_mapper as um
from urban_mapper.pipeline import UrbanPipeline
# Fetch the "oscur/NYC_vehicle_collisions" dataset through the UrbanMapper
# loader, declaring which columns hold the coordinates.
data = (
    um.UrbanMapper()
    .loader
    .from_huggingface("oscur/NYC_vehicle_collisions")
    .with_columns(longitude_column="LONGITUDE", latitude_column="LATITUDE")
    .load()
)

# Force both coordinate columns to float before persisting them.
data["LONGITUDE"] = data["LONGITUDE"].astype(float)
data["LATITUDE"] = data["LATITUDE"].astype(float)

# index=False keeps the row index out of the file; otherwise it would be read
# back later as an extra unnamed column.
data.to_csv("./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv", index=False)
In [ ]:
Copied!
#####################################################################################
# ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️
# The following shows custom aggregation functions
# used later on in the pipeline
#####################################################################################
def most_common_factor(series):
    """Return the most frequent value of ``series``.

    Args:
        series: pandas Series holding the values of one group.

    Returns:
        The first entry of ``series.mode()``, or ``None`` when the series is
        empty or ``mode()`` yields no result.
    """
    if not series.empty:
        modes = series.mode()
        if not modes.empty:
            return modes.iloc[0]
    return None
def proportion_pedestrian_injuries(series):
    """Return the share of rows in ``series`` whose value is greater than zero.

    Args:
        series: pandas Series of per-collision pedestrian injury counts.

    Returns:
        Number of entries ``> 0`` divided by ``len(series)``; ``0`` for an
        empty series.
    """
    n_rows = len(series)
    if n_rows:
        with_injury = series.gt(0).sum()
        return with_injury / n_rows
    return 0
def max_injuries(series):
    """Return the largest value in ``series``, or ``0`` if it is empty."""
    if series.empty:
        return 0
    return series.max()
def most_common_vehicle(series):
    """Return the most frequent vehicle value of ``series``.

    Args:
        series: pandas Series holding the values of one group.

    Returns:
        The first entry of ``series.mode()``, or ``None`` when the series is
        empty or ``mode()`` yields no result.
    """
    if series.empty:
        return None
    candidates = series.mode()
    if candidates.empty:
        return None
    return candidates.iloc[0]
def injury_variance(series):
    """Return ``series.var()``, or ``0`` when there are fewer than two rows."""
    if len(series) <= 1:
        return 0
    return series.var()
def most_common_day(series):
    """Return the name of the weekday on which most dates in ``series`` fall.

    Args:
        series: pandas Series of dates. Input that is not already of a
            datetime dtype is converted with ``pd.to_datetime``.

    Returns:
        A weekday name ("Monday" .. "Sunday") for the first mode of the
        weekday numbers, or ``None`` when the series is empty or contains no
        usable date.

    Raises:
        ValueError: If the values cannot be converted to datetimes.
    """
    if series.empty:
        return None

    if not pd.api.types.is_datetime64_any_dtype(series):
        try:
            series = pd.to_datetime(series)
        except Exception as exc:
            # Translate any conversion failure into ValueError, keeping the
            # original exception attached as the cause.
            raise ValueError(f"Could not convert series to datetime: {exc}") from exc

    mode = series.dt.dayofweek.mode()
    if mode.empty:
        return None

    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    # dayofweek is float-typed when the series contains missing dates (NaT),
    # and a float cannot index a list, so coerce to int first.
    return day_names[int(mode.iloc[0])]
def proportion_night_collisions(series):
    """Return the share of times in ``series`` that fall at night.

    Night is defined here as an hour of day ``>= 18.0`` or ``< 6.0``.

    Args:
        series: pandas Series of time strings in 'HH:MM' format.

    Returns:
        The mean of the night mask (a value between 0 and 1); ``0`` for an
        empty series.

    Raises:
        ValueError: If any entry is not a string in 'HH:MM' format. This
            includes missing values and strings without a ':' separator.
    """
    if series.empty:
        return 0

    def _to_decimal_hour(value):
        # Split once; only the first two fields (hours, minutes) are used.
        hours, minutes = value.split(":")[:2]
        return int(hours) + int(minutes) / 60

    try:
        times = series.apply(_to_decimal_hour)
    except (ValueError, AttributeError) as exc:
        # AttributeError covers non-string entries such as NaN, so every
        # malformed value produces the same format error.
        raise ValueError("Ensure all times are in 'HH:MM' format.") from exc

    night_mask = (times >= 18.0) | (times < 6.0)
    return night_mask.mean()
def average_motorist_injuries(series):
    """Return the mean of ``series``, or ``0`` if it is empty."""
    if series.empty:
        return 0
    return series.mean()
def factor_diversity(series):
    """Return the number of distinct non-missing values in ``series``.

    Args:
        series: pandas Series, or a DataFrame whose columns are pooled
            together before counting. ``Series`` has no ``stack`` method, so
            the values are flattened in a way that works for both.

    Returns:
        Count of unique non-null values as an ``int``; ``0`` for empty input.
    """
    if series.empty:
        return 0
    # Flatten to one dimension so both 1-D and 2-D input share one code path.
    flat_values = pd.Series(series.to_numpy().ravel())
    # nunique() excludes missing values by default.
    return int(flat_values.nunique())
def proportion_fatal_collisions(series):
    """Return the share of rows in ``series`` whose value is greater than zero.

    Args:
        series: pandas Series of per-collision fatality counts.

    Returns:
        Number of entries ``> 0`` divided by ``len(series)``; ``0`` for an
        empty series.
    """
    if series.empty:
        return 0
    n_fatal = series.gt(0).sum()
    n_rows = len(series)
    return n_fatal / n_rows
def most_common_vehicle_pair(series):
    """Return the most frequent unordered pair of vehicle types.

    Args:
        series: Despite its name, a DataFrame-like object exposing the
            "VEHICLE TYPE CODE 1" and "VEHICLE TYPE CODE 2" columns.

    Returns:
        A sorted 2-tuple of vehicle types (the first mode over all rows where
        both columns are present), or ``None`` when the input is empty or no
        complete row exists.
    """
    if series.empty:
        return None

    def _unordered_pair(row):
        # Sorting makes (A, B) and (B, A) count as the same pair.
        first = row["VEHICLE TYPE CODE 1"]
        second = row["VEHICLE TYPE CODE 2"]
        return tuple(sorted([first, second]))

    complete_rows = series[["VEHICLE TYPE CODE 1", "VEHICLE TYPE CODE 2"]].dropna()
    pairs = complete_rows.apply(_unordered_pair, axis=1)
    modes = pairs.mode()
    if modes.empty:
        return None
    return modes.iloc[0]
#####################################################################################
# ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️
# The following shows custom aggregation functions
# used later on in the pipeline
#####################################################################################
def most_common_factor(series):
    """Return the most frequent value of ``series``.

    Args:
        series: pandas Series holding the values of one group.

    Returns:
        The first entry of ``series.mode()``, or ``None`` when the series is
        empty or ``mode()`` yields no result.
    """
    if not series.empty:
        modes = series.mode()
        if not modes.empty:
            return modes.iloc[0]
    return None
def proportion_pedestrian_injuries(series):
    """Return the share of rows in ``series`` whose value is greater than zero.

    Args:
        series: pandas Series of per-collision pedestrian injury counts.

    Returns:
        Number of entries ``> 0`` divided by ``len(series)``; ``0`` for an
        empty series.
    """
    n_rows = len(series)
    if n_rows:
        with_injury = series.gt(0).sum()
        return with_injury / n_rows
    return 0
def max_injuries(series):
    """Return the largest value in ``series``, or ``0`` if it is empty."""
    if series.empty:
        return 0
    return series.max()
def most_common_vehicle(series):
    """Return the most frequent vehicle value of ``series``.

    Args:
        series: pandas Series holding the values of one group.

    Returns:
        The first entry of ``series.mode()``, or ``None`` when the series is
        empty or ``mode()`` yields no result.
    """
    if series.empty:
        return None
    candidates = series.mode()
    if candidates.empty:
        return None
    return candidates.iloc[0]
def injury_variance(series):
    """Return ``series.var()``, or ``0`` when there are fewer than two rows."""
    if len(series) <= 1:
        return 0
    return series.var()
def most_common_day(series):
    """Return the name of the weekday on which most dates in ``series`` fall.

    Args:
        series: pandas Series of dates. Input that is not already of a
            datetime dtype is converted with ``pd.to_datetime``.

    Returns:
        A weekday name ("Monday" .. "Sunday") for the first mode of the
        weekday numbers, or ``None`` when the series is empty or contains no
        usable date.

    Raises:
        ValueError: If the values cannot be converted to datetimes.
    """
    if series.empty:
        return None

    if not pd.api.types.is_datetime64_any_dtype(series):
        try:
            series = pd.to_datetime(series)
        except Exception as exc:
            # Translate any conversion failure into ValueError, keeping the
            # original exception attached as the cause.
            raise ValueError(f"Could not convert series to datetime: {exc}") from exc

    mode = series.dt.dayofweek.mode()
    if mode.empty:
        return None

    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    # dayofweek is float-typed when the series contains missing dates (NaT),
    # and a float cannot index a list, so coerce to int first.
    return day_names[int(mode.iloc[0])]
def proportion_night_collisions(series):
    """Return the share of times in ``series`` that fall at night.

    Night is defined here as an hour of day ``>= 18.0`` or ``< 6.0``.

    Args:
        series: pandas Series of time strings in 'HH:MM' format.

    Returns:
        The mean of the night mask (a value between 0 and 1); ``0`` for an
        empty series.

    Raises:
        ValueError: If any entry is not a string in 'HH:MM' format. This
            includes missing values and strings without a ':' separator.
    """
    if series.empty:
        return 0

    def _to_decimal_hour(value):
        # Split once; only the first two fields (hours, minutes) are used.
        hours, minutes = value.split(":")[:2]
        return int(hours) + int(minutes) / 60

    try:
        times = series.apply(_to_decimal_hour)
    except (ValueError, AttributeError) as exc:
        # AttributeError covers non-string entries such as NaN, so every
        # malformed value produces the same format error.
        raise ValueError("Ensure all times are in 'HH:MM' format.") from exc

    night_mask = (times >= 18.0) | (times < 6.0)
    return night_mask.mean()
def average_motorist_injuries(series):
    """Return the mean of ``series``, or ``0`` if it is empty."""
    if series.empty:
        return 0
    return series.mean()
def factor_diversity(series):
    """Return the number of distinct non-missing values in ``series``.

    Args:
        series: pandas Series, or a DataFrame whose columns are pooled
            together before counting. ``Series`` has no ``stack`` method, so
            the values are flattened in a way that works for both.

    Returns:
        Count of unique non-null values as an ``int``; ``0`` for empty input.
    """
    if series.empty:
        return 0
    # Flatten to one dimension so both 1-D and 2-D input share one code path.
    flat_values = pd.Series(series.to_numpy().ravel())
    # nunique() excludes missing values by default.
    return int(flat_values.nunique())
def proportion_fatal_collisions(series):
    """Return the share of rows in ``series`` whose value is greater than zero.

    Args:
        series: pandas Series of per-collision fatality counts.

    Returns:
        Number of entries ``> 0`` divided by ``len(series)``; ``0`` for an
        empty series.
    """
    if series.empty:
        return 0
    n_fatal = series.gt(0).sum()
    n_rows = len(series)
    return n_fatal / n_rows
def most_common_vehicle_pair(series):
    """Return the most frequent unordered pair of vehicle types.

    Args:
        series: Despite its name, a DataFrame-like object exposing the
            "VEHICLE TYPE CODE 1" and "VEHICLE TYPE CODE 2" columns.

    Returns:
        A sorted 2-tuple of vehicle types (the first mode over all rows where
        both columns are present), or ``None`` when the input is empty or no
        complete row exists.
    """
    if series.empty:
        return None

    def _unordered_pair(row):
        # Sorting makes (A, B) and (B, A) count as the same pair.
        first = row["VEHICLE TYPE CODE 1"]
        second = row["VEHICLE TYPE CODE 2"]
        return tuple(sorted([first, second]))

    complete_rows = series[["VEHICLE TYPE CODE 1", "VEHICLE TYPE CODE 2"]].dropna()
    pairs = complete_rows.apply(_unordered_pair, axis=1)
    modes = pairs.mode()
    if modes.empty:
        return None
    return modes.iloc[0]
In [ ]:
Copied!
# Declarative pipeline: a list of (step_name, built_component) pairs.
# Every enricher groups records by "nearest_intersection", the output column
# configured on the urban layer's mapping.
pipeline = UrbanPipeline([
    # Street-intersections layer for Downtown Brooklyn (network_type="drive").
    # The mapping uses LONGITUDE/LATITUDE and writes "nearest_intersection".
    ("urban_layer", (
        um.UrbanMapper().urban_layer
        .with_type("streets_intersections")
        .from_place("Downtown Brooklyn, New York City, USA", network_type="drive")
        .with_mapping(
            longitude_column="LONGITUDE",
            latitude_column="LATITUDE",
            output_column="nearest_intersection"
        )
        .build()
    )),
    # Collision records read from the local CSV file.
    ("loader", (
        um.UrbanMapper().loader
        .from_file("./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv")
        .with_columns(longitude_column="LONGITUDE", latitude_column="LATITUDE")
        .build()
    )),
    # Imputer applied to the two coordinate columns.
    # NOTE(review): presumably handles rows with missing coordinates — confirm
    # against the UrbanMapper documentation.
    ("imputer", (
        um.UrbanMapper().imputer
        .with_type("SimpleGeoImputer")
        .on_columns("LONGITUDE", "LATITUDE")
        .build()
    )),
    # NOTE(review): presumably keeps only records inside the layer's bounding
    # box — confirm against the UrbanMapper documentation.
    ("filter", um.UrbanMapper().filter.with_type("BoundingBoxFilter").build()),
    # Built-in "sum" of injured persons per intersection.
    ("enrich_injuries", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS INJURED")
        .aggregate_by(method="sum", output_column="total_injuries")
        .build()
    )),
    # Built-in "sum" of killed persons per intersection.
    ("enrich_fatalities", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS KILLED")
        .aggregate_by(method="sum", output_column="total_fatalities")
        .build()
    )),
    # The enrichers below pass custom callables as the aggregation method.
    ("enrich_factors", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="CONTRIBUTING FACTOR VEHICLE 1")
        .aggregate_by(method=most_common_factor, output_column="most_common_factor")
        .build()
    )),
    ("enrich_pedestrian_prop", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PEDESTRIANS INJURED")
        .aggregate_by(method=proportion_pedestrian_injuries, output_column="prop_pedestrian_injuries")
        .build()
    )),
    ("enrich_max_injuries", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS INJURED")
        .aggregate_by(method=max_injuries, output_column="max_injuries_in_single_collision")
        .build()
    )),
    ("enrich_vehicles", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="VEHICLE TYPE CODE 1")
        .aggregate_by(method=most_common_vehicle, output_column="most_common_vehicle")
        .build()
    )),
    ("enrich_injury_var", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS INJURED")
        .aggregate_by(method=injury_variance, output_column="injury_variance")
        .build()
    )),
    ("enrich_peak_day", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="CRASH DATE")
        .aggregate_by(method=most_common_day, output_column="peak_collision_day")
        .build()
    )),
    ("enrich_night_prop", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="CRASH TIME")
        .aggregate_by(method=proportion_night_collisions, output_column="prop_night_collisions")
        .build()
    )),
    ("enrich_avg_motorist_injuries", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF MOTORIST INJURED")
        .aggregate_by(method=average_motorist_injuries, output_column="avg_motorist_injuries_per_collision")
        .build()
    )),
    # Count of records per intersection; no values_from column is given.
    ("enrich_count_collisions", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection")
        .count_by(output_column="collision_count")
        .build()
    )),
    # Interactive visualiser with a dark basemap and white colour-bar text.
    ("visualiser", (
        um.UrbanMapper().visual
        .with_type("Interactive")
        .with_style({"tiles": "CartoDB dark_matter", "colorbar_text_color": "white"})
        .build()
    ))
])
# Declarative pipeline: a list of (step_name, built_component) pairs.
# Every enricher groups records by "nearest_intersection", the output column
# configured on the urban layer's mapping.
pipeline = UrbanPipeline([
    # Street-intersections layer for Downtown Brooklyn (network_type="drive").
    # The mapping uses LONGITUDE/LATITUDE and writes "nearest_intersection".
    ("urban_layer", (
        um.UrbanMapper().urban_layer
        .with_type("streets_intersections")
        .from_place("Downtown Brooklyn, New York City, USA", network_type="drive")
        .with_mapping(
            longitude_column="LONGITUDE",
            latitude_column="LATITUDE",
            output_column="nearest_intersection"
        )
        .build()
    )),
    # Collision records read from the local CSV file.
    ("loader", (
        um.UrbanMapper().loader
        .from_file("./NYC_Motor_Vehicle_Collisions_Mar_12_2025.csv")
        .with_columns(longitude_column="LONGITUDE", latitude_column="LATITUDE")
        .build()
    )),
    # Imputer applied to the two coordinate columns.
    # NOTE(review): presumably handles rows with missing coordinates — confirm
    # against the UrbanMapper documentation.
    ("imputer", (
        um.UrbanMapper().imputer
        .with_type("SimpleGeoImputer")
        .on_columns("LONGITUDE", "LATITUDE")
        .build()
    )),
    # NOTE(review): presumably keeps only records inside the layer's bounding
    # box — confirm against the UrbanMapper documentation.
    ("filter", um.UrbanMapper().filter.with_type("BoundingBoxFilter").build()),
    # Built-in "sum" of injured persons per intersection.
    ("enrich_injuries", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS INJURED")
        .aggregate_by(method="sum", output_column="total_injuries")
        .build()
    )),
    # Built-in "sum" of killed persons per intersection.
    ("enrich_fatalities", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS KILLED")
        .aggregate_by(method="sum", output_column="total_fatalities")
        .build()
    )),
    # The enrichers below pass custom callables as the aggregation method.
    ("enrich_factors", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="CONTRIBUTING FACTOR VEHICLE 1")
        .aggregate_by(method=most_common_factor, output_column="most_common_factor")
        .build()
    )),
    ("enrich_pedestrian_prop", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PEDESTRIANS INJURED")
        .aggregate_by(method=proportion_pedestrian_injuries, output_column="prop_pedestrian_injuries")
        .build()
    )),
    ("enrich_max_injuries", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS INJURED")
        .aggregate_by(method=max_injuries, output_column="max_injuries_in_single_collision")
        .build()
    )),
    ("enrich_vehicles", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="VEHICLE TYPE CODE 1")
        .aggregate_by(method=most_common_vehicle, output_column="most_common_vehicle")
        .build()
    )),
    ("enrich_injury_var", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF PERSONS INJURED")
        .aggregate_by(method=injury_variance, output_column="injury_variance")
        .build()
    )),
    ("enrich_peak_day", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="CRASH DATE")
        .aggregate_by(method=most_common_day, output_column="peak_collision_day")
        .build()
    )),
    ("enrich_night_prop", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="CRASH TIME")
        .aggregate_by(method=proportion_night_collisions, output_column="prop_night_collisions")
        .build()
    )),
    ("enrich_avg_motorist_injuries", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection", values_from="NUMBER OF MOTORIST INJURED")
        .aggregate_by(method=average_motorist_injuries, output_column="avg_motorist_injuries_per_collision")
        .build()
    )),
    # Count of records per intersection; no values_from column is given.
    ("enrich_count_collisions", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_intersection")
        .count_by(output_column="collision_count")
        .build()
    )),
    # Interactive visualiser with a dark basemap and white colour-bar text.
    ("visualiser", (
        um.UrbanMapper().visual
        .with_type("Interactive")
        .with_style({"tiles": "CartoDB dark_matter", "colorbar_text_color": "white"})
        .build()
    ))
])
In [ ]:
Copied!
# Execute the pipeline; the call returns two values, unpacked here as the
# mapped records and the enriched urban layer.
mapped_data, enriched_layer = pipeline.compose_transform()
# NOTE(review): the same statement appears twice in this export, so the
# pipeline is executed a second time here.
mapped_data, enriched_layer = pipeline.compose_transform()
In [ ]:
Copied!
# Visualise the listed result columns; the names match the output_column
# values configured on the pipeline's enricher steps.
fig = pipeline.visualise([
    "total_injuries", "total_fatalities", "most_common_factor", "prop_pedestrian_injuries",
    "max_injuries_in_single_collision", "most_common_vehicle", "injury_variance",
    "peak_collision_day", "prop_night_collisions", "avg_motorist_injuries_per_collision",
    "collision_count",
])
# Bare expression: presumably left as the last line so the notebook displays it.
fig
# NOTE(review): the statements below repeat the ones above verbatim in this export.
fig = pipeline.visualise([
    "total_injuries", "total_fatalities", "most_common_factor", "prop_pedestrian_injuries",
    "max_injuries_in_single_collision", "most_common_vehicle", "injury_variance",
    "peak_collision_day", "prop_night_collisions", "avg_motorist_injuries_per_collision",
    "collision_count",
])
fig
In [ ]:
Copied!
# Save the pipeline to a local .dill file.
pipeline.save("./collisions_advanced_pipeline.dill")
# NOTE(review): repeated verbatim in this export; saves to the same path again.
pipeline.save("./collisions_advanced_pipeline.dill")
In [ ]:
Copied!
# Export the pipeline to JupyterGIS for collaborative exploration.
# raise_on_existing=False: presumably tolerates an already existing output
# file instead of raising — confirm against the UrbanMapper documentation.
pipeline.to_jgis(
    filepath="collision_exploration.JGIS",
    urban_layer_name="Collisions Intersections Information",
    raise_on_existing=False,
)
# NOTE(review): repeated verbatim in this export; the second call targets the
# same filepath.
pipeline.to_jgis(
    filepath="collision_exploration.JGIS",
    urban_layer_name="Collisions Intersections Information",
    raise_on_existing=False,
)
In [ ]:
Copied!