EU Restaurants Study – Easy Pipeline
Data Source:
In [ ]:
Copied!
#####################################################################################
# ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️
# The raw CSV needs some wrangling before it can be used downstream, hence
# this "manual" load that writes a pre-processed Parquet copy of the dataset.
#####################################################################################
from urban_mapper import CSVLoader
import urban_mapper

file_path = "./tripadvisor_european_restaurants.csv"

# Load the CSV through the loader, then renumber the rows from zero.
# NOTE(review): _load_data_from_file is underscore-prefixed (non-public) — confirm
# there is no public equivalent before relying on it.
df = CSVLoader(
    file_path,
    latitude_column="latitude",
    longitude_column="longitude",
)._load_data_from_file().reset_index(drop=True)

# Keep only the first occurrence of every column name.
df = df.loc[:, ~df.columns.duplicated()]

# Sanity check on the index, then persist the cleaned table for the pipeline.
print(f"df Duplicated Indexs: {df.index.duplicated().sum()}")
df.to_parquet("./tripadvisor_european_restaurants.parquet")

mapper = urban_mapper.UrbanMapper()
mapper.table_vis.interactive_display(df)
#####################################################################################
# ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️
# The raw CSV needs some wrangling before it can be used downstream, hence
# this "manual" load that writes a pre-processed Parquet copy of the dataset.
#####################################################################################
from urban_mapper import CSVLoader
import urban_mapper

file_path = "./tripadvisor_european_restaurants.csv"

# Load the CSV through the loader, then renumber the rows from zero.
# NOTE(review): _load_data_from_file is underscore-prefixed (non-public) — confirm
# there is no public equivalent before relying on it.
df = CSVLoader(
    file_path,
    latitude_column="latitude",
    longitude_column="longitude",
)._load_data_from_file().reset_index(drop=True)

# Keep only the first occurrence of every column name.
df = df.loc[:, ~df.columns.duplicated()]

# Sanity check on the index, then persist the cleaned table for the pipeline.
print(f"df Duplicated Indexs: {df.index.duplicated().sum()}")
df.to_parquet("./tripadvisor_european_restaurants.parquet")

mapper = urban_mapper.UrbanMapper()
mapper.table_vis.interactive_display(df)
In [ ]:
Copied!
import pandas as pd
from typing import Optional
def no_yes_prop(series: pd.Series) -> Optional[float]:
    """Return the share of ``'Y'`` answers among the valid ``'Y'``/``'N'`` entries.

    Matching is case-insensitive. Missing values and anything that is not a
    yes/no flag are excluded from both the numerator and the denominator, so
    unanswered rows do not drag the proportion down.

    Args:
        series: Values of a yes/no column for one group.

    Returns:
        A float in ``[0, 1]``, or ``None`` when the series is empty or holds
        no ``'Y'``/``'N'`` entry at all.
    """
    answers = series.dropna()
    if answers.empty:
        return None
    # Cast before using ``.str`` so a group of non-string values does not raise.
    flags = answers.astype(str).str.upper()
    # Only genuine yes/no answers take part in the proportion.
    flags = flags[flags.isin(["Y", "N"])]
    if flags.empty:
        return None
    return float((flags == "Y").mean())
def most_frequent_city(series: pd.Series) -> Optional[str]:
    """Pick the most common string value found in ``series``.

    Only entries that are ``str`` instances are counted; everything else
    (missing values, numbers, ...) is ignored. When several values share the
    top count, the first value returned by ``Series.mode()`` is used.

    Args:
        series: Values of a text column for one group.

    Returns:
        The winning string, or ``None`` when the series is empty or contains
        no string entry.
    """
    if series.empty:
        return None
    # A str instance is never a missing value, so this single test is enough.
    is_text = [isinstance(value, str) for value in series]
    names = series[is_text]
    if names.empty:
        return None
    candidates = names.mode()
    if candidates.empty:
        return None
    return candidates.iloc[0]
import pandas as pd
from typing import Optional
def no_yes_prop(series: pd.Series) -> Optional[float]:
    """Return the share of ``'Y'`` answers among the valid ``'Y'``/``'N'`` entries.

    Matching is case-insensitive. Missing values and anything that is not a
    yes/no flag are excluded from both the numerator and the denominator, so
    unanswered rows do not drag the proportion down.

    Args:
        series: Values of a yes/no column for one group.

    Returns:
        A float in ``[0, 1]``, or ``None`` when the series is empty or holds
        no ``'Y'``/``'N'`` entry at all.
    """
    answers = series.dropna()
    if answers.empty:
        return None
    # Cast before using ``.str`` so a group of non-string values does not raise.
    flags = answers.astype(str).str.upper()
    # Only genuine yes/no answers take part in the proportion.
    flags = flags[flags.isin(["Y", "N"])]
    if flags.empty:
        return None
    return float((flags == "Y").mean())
def most_frequent_city(series: pd.Series) -> Optional[str]:
    """Pick the most common string value found in ``series``.

    Only entries that are ``str`` instances are counted; everything else
    (missing values, numbers, ...) is ignored. When several values share the
    top count, the first value returned by ``Series.mode()`` is used.

    Args:
        series: Values of a text column for one group.

    Returns:
        The winning string, or ``None`` when the series is empty or contains
        no string entry.
    """
    if series.empty:
        return None
    # A str instance is never a missing value, so this single test is enough.
    is_text = [isinstance(value, str) for value in series]
    names = series[is_text]
    if names.empty:
        return None
    candidates = names.mode()
    if candidates.empty:
        return None
    return candidates.iloc[0]
In [ ]:
Copied!
from urban_mapper.pipeline import UrbanPipeline
import urban_mapper as um

# Declarative pipeline: a list of (step_name, built_component) pairs. Every
# component is produced by its own fresh UrbanMapper() builder chain.
pipeline = UrbanPipeline([
    # Custom urban layer read from a GeoJSON file; the mapping writes its
    # result to the "nearest_country" column, which every enricher groups by.
    # NOTE(review): the mapping reads "temporary_longitude"/"temporary_latitude"
    # while the loader below declares "longitude"/"latitude" — confirm these
    # temporary columns exist at mapping time.
    ("urban_layer", (
        um.UrbanMapper().urban_layer
        .with_type("custom_urban_layer")
        .from_file("./Europe GeoJSON.geojson")
        .with_mapping(
            longitude_column="temporary_longitude",
            latitude_column="temporary_latitude",
            output_column="nearest_country"
        )
        .build()
    )),
    # Loader for the pre-processed Parquet file, with its coordinate columns.
    ("loader", (
        um.UrbanMapper().loader
        .from_file("./tripadvisor_european_restaurants.parquet")
        .with_columns(longitude_column="longitude", latitude_column="latitude")
        .build()
    )),
    # SimpleGeoImputer configured on the two coordinate columns.
    ("impute", (
        um.UrbanMapper().imputer
        .with_type("SimpleGeoImputer")
        .on_columns("longitude", "latitude")
        .build()
    )),
    # BoundingBoxFilter with no extra options.
    # NOTE(review): presumably restricts records to the layer's extent — confirm.
    ("filter", um.UrbanMapper().filter.with_type("BoundingBoxFilter").build()),
    # --- Enrichers: all grouped by "nearest_country" ---
    # Count per group -> "restaurants_count".
    ("enrich_restaurants_count", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country")
        .count_by(output_column="restaurants_count")
        .build()
    )),
    # The three dietary columns are aggregated with the custom no_yes_prop.
    ("enrich_vegetarian_friendly", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="vegetarian_friendly")
        .aggregate_by(method=no_yes_prop, output_column="vegetarian_prop")
        .build()
    )),
    ("enrich_vegan_options", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="vegan_options")
        .aggregate_by(method=no_yes_prop, output_column="vegan_options_prop")
        .build()
    )),
    ("enrich_gluten-free", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="gluten_free")
        .aggregate_by(method=no_yes_prop, output_column="gluten_free_prop")
        .build()
    )),
    # The three numeric columns use the built-in "mean" aggregation.
    ("enrich_open_days_per_week", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="open_days_per_week")
        .aggregate_by(method="mean", output_column="open_days_per_week_avg")
        .build()
    )),
    ("enrich_avg_rating", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="avg_rating")
        .aggregate_by(method="mean", output_column="overall_avg_rating")
        .build()
    )),
    ("enrich_total_reviews_per_count", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="total_reviews_count")
        .aggregate_by(method="mean", output_column="total_reviews_count_avg")
        .build()
    )),
    # "city" is aggregated with the custom most_frequent_city.
    ("enrich_most_frequent_city", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="city")
        .aggregate_by(method=most_frequent_city, output_column="most_frequent_city")
        .build()
    )),
    # Interactive visualiser with a dark tile set and white colour-bar text.
    ("visualiser", (
        um.UrbanMapper().visual
        .with_type("Interactive")
        .with_style({
            "tiles": "CartoDB dark_matter",
            "colorbar_text_color": "white",
        })
        .build()
    ))
])
from urban_mapper.pipeline import UrbanPipeline
import urban_mapper as um

# Declarative pipeline: a list of (step_name, built_component) pairs. Every
# component is produced by its own fresh UrbanMapper() builder chain.
pipeline = UrbanPipeline([
    # Custom urban layer read from a GeoJSON file; the mapping writes its
    # result to the "nearest_country" column, which every enricher groups by.
    # NOTE(review): the mapping reads "temporary_longitude"/"temporary_latitude"
    # while the loader below declares "longitude"/"latitude" — confirm these
    # temporary columns exist at mapping time.
    ("urban_layer", (
        um.UrbanMapper().urban_layer
        .with_type("custom_urban_layer")
        .from_file("./Europe GeoJSON.geojson")
        .with_mapping(
            longitude_column="temporary_longitude",
            latitude_column="temporary_latitude",
            output_column="nearest_country"
        )
        .build()
    )),
    # Loader for the pre-processed Parquet file, with its coordinate columns.
    ("loader", (
        um.UrbanMapper().loader
        .from_file("./tripadvisor_european_restaurants.parquet")
        .with_columns(longitude_column="longitude", latitude_column="latitude")
        .build()
    )),
    # SimpleGeoImputer configured on the two coordinate columns.
    ("impute", (
        um.UrbanMapper().imputer
        .with_type("SimpleGeoImputer")
        .on_columns("longitude", "latitude")
        .build()
    )),
    # BoundingBoxFilter with no extra options.
    # NOTE(review): presumably restricts records to the layer's extent — confirm.
    ("filter", um.UrbanMapper().filter.with_type("BoundingBoxFilter").build()),
    # --- Enrichers: all grouped by "nearest_country" ---
    # Count per group -> "restaurants_count".
    ("enrich_restaurants_count", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country")
        .count_by(output_column="restaurants_count")
        .build()
    )),
    # The three dietary columns are aggregated with the custom no_yes_prop.
    ("enrich_vegetarian_friendly", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="vegetarian_friendly")
        .aggregate_by(method=no_yes_prop, output_column="vegetarian_prop")
        .build()
    )),
    ("enrich_vegan_options", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="vegan_options")
        .aggregate_by(method=no_yes_prop, output_column="vegan_options_prop")
        .build()
    )),
    ("enrich_gluten-free", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="gluten_free")
        .aggregate_by(method=no_yes_prop, output_column="gluten_free_prop")
        .build()
    )),
    # The three numeric columns use the built-in "mean" aggregation.
    ("enrich_open_days_per_week", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="open_days_per_week")
        .aggregate_by(method="mean", output_column="open_days_per_week_avg")
        .build()
    )),
    ("enrich_avg_rating", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="avg_rating")
        .aggregate_by(method="mean", output_column="overall_avg_rating")
        .build()
    )),
    ("enrich_total_reviews_per_count", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="total_reviews_count")
        .aggregate_by(method="mean", output_column="total_reviews_count_avg")
        .build()
    )),
    # "city" is aggregated with the custom most_frequent_city.
    ("enrich_most_frequent_city", (
        um.UrbanMapper().enricher
        .with_data(group_by="nearest_country", values_from="city")
        .aggregate_by(method=most_frequent_city, output_column="most_frequent_city")
        .build()
    )),
    # Interactive visualiser with a dark tile set and white colour-bar text.
    ("visualiser", (
        um.UrbanMapper().visual
        .with_type("Interactive")
        .with_style({
            "tiles": "CartoDB dark_matter",
            "colorbar_text_color": "white",
        })
        .build()
    ))
])
In [ ]:
Copied!
# Execute the pipeline: compose_transform() returns a pair, unpacked here
# into `mapped_data` and `enriched_layer`.
mapped_data, enriched_layer = pipeline.compose_transform()
# Execute the pipeline: compose_transform() returns a pair, unpacked here
# into `mapped_data` and `enriched_layer`.
mapped_data, enriched_layer = pipeline.compose_transform()
In [ ]:
Copied!
# Visualise the enriched metrics: every name below is an output_column
# produced by one of the pipeline's enricher steps.
fig = pipeline.visualise([
    "restaurants_count",
    "vegetarian_prop",
    "vegan_options_prop",
    "gluten_free_prop",
    "open_days_per_week_avg",
    "overall_avg_rating",
    "total_reviews_count_avg",
    "most_frequent_city"
])
# Bare expression as the cell's last statement, so the notebook displays it.
fig
# Visualise the enriched metrics: every name below is an output_column
# produced by one of the pipeline's enricher steps.
fig = pipeline.visualise([
    "restaurants_count",
    "vegetarian_prop",
    "vegan_options_prop",
    "gluten_free_prop",
    "open_days_per_week_avg",
    "overall_avg_rating",
    "total_reviews_count_avg",
    "most_frequent_city"
])
# Bare expression as the cell's last statement, so the notebook displays it.
fig
In [ ]:
Copied!
# Save the pipeline to ./EU_restaurant_counts.dill
# NOTE(review): the extension suggests dill serialisation — confirm.
pipeline.save("./EU_restaurant_counts.dill")
# Save the pipeline to ./EU_restaurant_counts.dill
# NOTE(review): the extension suggests dill serialisation — confirm.
pipeline.save("./EU_restaurant_counts.dill")
In [ ]:
Copied!
# Export the pipeline to JupyterGIS for collaborative exploration.
# The export goes to EU_restaurant_counts.JGIS (relative path); the urban layer
# is passed the name "European Union Restaurants Analysis".
pipeline.to_jgis(
    filepath="EU_restaurant_counts.JGIS",
    urban_layer_name="European Union Restaurants Analysis",
)
# Export the pipeline to JupyterGIS for collaborative exploration.
# The export goes to EU_restaurant_counts.JGIS (relative path); the urban layer
# is passed the name "European Union Restaurants Analysis".
pipeline.to_jgis(
    filepath="EU_restaurant_counts.JGIS",
    urban_layer_name="European Union Restaurants Analysis",
)
In [ ]:
Copied!
In [ ]:
Copied!