Remarkable Trees Paris – Advanced Pipeline
This notebook explores the distribution and characteristics of remarkable trees across Paris neighborhoods using data from Paris Open Data. The dataset, created in 2006 by the Direction des Espaces Verts et de l'Environnement - Ville de Paris, includes geo-located remarkable trees found in diverse locations such as gardens, cemeteries, streets, schools, and early childhood institutions. These trees are notable for their age, size, rarity, or historical significance.
The study maps these trees to their respective neighborhoods (quartiers) and enriches the data with the following neighborhood-level metrics:
- Count of remarkable trees: Total number of remarkable trees per neighborhood.
- Average circumference: Mean circumference of trees (in cm) per neighborhood.
- Average height: Mean height of trees (in meters) per neighborhood.
- Most common genus: The predominant tree genus in each neighborhood.
- Oldest plantation date: The earliest recorded plantation date per neighborhood.
- Summary of resumes: An LLM-generated summary (in English) of the combined 'Résumé' (summary notes) for all remarkable trees in each neighborhood.
- Summary of descriptions: An LLM-generated summary (in English) of the combined 'Descriptif' (detailed descriptions) for all remarkable trees in each neighborhood.
Through this pipeline, the notebook processes the data, applies spatial filters, and visualises the enriched metrics on interactive maps, offering insights into how remarkable trees are distributed and characterized across Paris.
#####################################################################################
# ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️
# The following shows custom aggregation functions
# used later on in the pipeline
# Make sure your OpenAI API key is exported as an environment variable (e.g. OPENAI_API_KEY) in the terminal session that launches this notebook.
#####################################################################################
import pandas as pd
import ell
def most_common_genre(series):
    """Return the most frequent value of ``series``, or ``None`` when there is none.

    Used as a custom aggregation for the per-neighbourhood "most common
    genus" metric. When several values are tied, the first entry of
    ``Series.mode()`` is returned.
    """
    # Nothing to aggregate for an empty group.
    if series.empty:
        return None

    candidates = series.mode()
    # ``mode()`` can come back empty (e.g. only missing values in the group).
    if candidates.empty:
        return None
    return candidates.iloc[0]
def oldest_plantation_date(series):
    """Return the earliest date found in ``series``.

    Used as a custom aggregation for the per-neighbourhood "oldest
    plantation date" metric.

    Args:
        series: pandas Series of dates. Values that are not already of a
            datetime dtype are parsed with ``pd.to_datetime`` (UTC,
            unparseable entries coerced to ``NaT``).

    Returns:
        ``None`` for an empty series, otherwise ``series.min()`` after
        conversion (``NaT`` when no value could be parsed).

    Raises:
        ValueError: if the conversion to datetime itself fails; the original
            exception is attached as ``__cause__``.
    """
    if series.empty:
        return None
    if not pd.api.types.is_datetime64_any_dtype(series):
        try:
            series = pd.to_datetime(series, errors="coerce", utc=True)
        except Exception as e:
            # Chain explicitly so the underlying pandas error stays visible.
            raise ValueError(f"Could not convert series to datetime: {e}") from e
    return series.min()
@ell.simple(model="gpt-4")
def summarize_texts(texts: str) -> str:
    """You are a urban planner expert and to write summarisation text for urban offices of city councils."""
    # NOTE(review): the docstring above is presumably consumed by `ell` as the
    # system prompt, i.e. it is runtime text rather than documentation —
    # confirm against the ell docs before rewording it.
    # The returned f-string is the request sent for `texts`: it asks (in French)
    # for a very concise summary, written entirely in English.
    return f"Résumez les textes suivants de manière très concise, output tout en Anglais s'il te plait :\n\n{texts}"
def summarize_resumes(series):
    """Summarise all free-text notes of one group into a single text.

    Used as a custom aggregation for the 'Résumé' and 'Descriptif' columns:
    the texts of a group are joined with spaces and handed to
    ``summarize_texts``.

    Args:
        series: pandas Series of text values; missing values are allowed
            and ignored.

    Returns:
        ``None`` when the group holds no text at all, the value returned by
        ``summarize_texts`` on success, or the string
        ``"Summary unavailable"`` when that call raises.
    """
    # Missing cells are NaN/None, which would make ``str.join`` raise a
    # TypeError before the try-block is even reached; drop them and make sure
    # every remaining item is a string.
    texts = series.dropna().astype(str)
    if texts.empty:
        return None

    combined_text = " ".join(texts)
    try:
        return summarize_texts(combined_text)
    except Exception as e:
        # Deliberate best effort: one failed summary must not abort the
        # aggregation of the remaining groups.
        print(f"Error generating summary: {e}")
        return "Summary unavailable"
WARNING: No API key found for model `gpt-4` using client `str` at time of definition. Can be okay if custom client specified later! https://docs.ell.so/core_concepts/models_and_api_clients.html
#####################################################################################
# ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️
# Some data wrangling is necessary because the raw data is not directly
# usable, hence the "manual" load below to create a pre-processed
# version of the dataset
#####################################################################################
from urban_mapper import CSVLoader
import urban_mapper
# Semicolon-separated raw export of the remarkable-trees dataset.
# NOTE(review): "idbase" is passed for both positional column arguments —
# presumably placeholders, since the real coordinates are only derived below;
# confirm against CSVLoader's signature.
file_path = "./arbresremarquablesparis.csv"
df = CSVLoader(file_path, "idbase", "idbase", separator=";")._load_data_from_file()

# 'Geo point' stores both coordinates in one comma-separated string:
# split it into two columns, then convert each one to float.
geo_point_parts = df['Geo point'].str.split(',', expand=True)
df[['latitude', 'longitude']] = geo_point_parts
for coordinate in ('latitude', 'longitude'):
    df[coordinate] = df[coordinate].str.strip().astype(float)
df.drop(columns=["Geo point"], inplace=True)

# Persist the pre-processed table; the pipeline's loader reads this file.
df.to_parquet("./trees_paris.parquet")

# Preview the cleaned table interactively.
mapper = urban_mapper.UrbanMapper()
mapper.table_vis.interactive_display(df)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[2], line 10 1 ##################################################################################### 2 3 # ⚠️ INFORMATION ABOUT THE CURRENT CELL ⚠️ (...) 7 8 ##################################################################################### ---> 10 from urban_mapper import CSVLoader 11 import urban_mapper 13 file_path = "./arbresremarquablesparis.csv" File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/__init__.py:3 1 from loguru import logger ----> 3 from .mixins import ( 4 LoaderMixin, 5 EnricherMixin, 6 VisualMixin, 7 TableVisMixin, 8 AuctusSearchMixin, 9 PipelineGeneratorMixin, 10 UrbanPipelineMixin, 11 ) 12 from .modules import ( 13 LoaderBase, 14 CSVLoader, (...) 30 PipelineGeneratorFactory, 31 ) 33 from .urban_mapper import UrbanMapper File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/mixins/__init__.py:1 ----> 1 from .loader import LoaderMixin 2 from .enricher import EnricherMixin 3 from .visual import VisualMixin File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/mixins/loader.py:1 ----> 1 from urban_mapper.modules.loader.loader_factory import LoaderFactory 4 class LoaderMixin(LoaderFactory): 5 def __init__(self): File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/__init__.py:1 ----> 1 from .loader import LoaderBase, CSVLoader, ShapefileLoader, ParquetLoader 2 from .imputer import ( 3 GeoImputerBase, 4 SimpleGeoImputer, 5 AddressGeoImputer, 6 ) 7 from .filter import ( 8 GeoFilterBase, 9 BoundingBoxFilter, 10 ) File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/loader/__init__.py:3 1 from .abc_loader import LoaderBase 2 from .loaders import CSVLoader, ShapefileLoader, ParquetLoader ----> 3 from .loader_factory import 
LoaderFactory 5 __all__ = [ 6 "LoaderBase", 7 "CSVLoader", (...) 10 "LoaderFactory", 11 ] File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/loader/loader_factory.py:19 17 from urban_mapper.modules.loader.loaders.csv_loader import CSVLoader 18 from urban_mapper.modules.loader.loaders.parquet_loader import ParquetLoader ---> 19 from urban_mapper.modules.loader.loaders.raster_loader import RasterLoader # Importing RasterLoader of the new raster loader module 20 from urban_mapper.modules.loader.loaders.shapefile_loader import ShapefileLoader 21 from urban_mapper.utils import require_attributes File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/loader/loaders/raster_loader.py:2 1 from ..abc_loader import LoaderBase ----> 2 import rasterio 3 from typing import Any 4 import numpy as np ModuleNotFoundError: No module named 'rasterio'
from urban_mapper.pipeline import UrbanPipeline
import urban_mapper as um
# Column produced by the urban-layer mapping; every enricher groups on it.
QUARTIER_COLUMN = "nearest_quartier"


def _aggregate_by_quartier(values_from, method, output_column):
    """Build an enricher aggregating ``values_from`` per neighbourhood into ``output_column``."""
    return (
        um.UrbanMapper().enricher
        .with_data(group_by=QUARTIER_COLUMN, values_from=values_from)
        .aggregate_by(method=method, output_column=output_column)
        .build()
    )


# Columns shown in the map tooltip.
# NOTE(review): "ramarquable_trees_count" is misspelt, but the same key is
# used by the count enricher and by the later visualise call — keep in sync.
tooltip_columns = [
    "ramarquable_trees_count",
    "avg_circonference",
    "avg_hauteur",
    "most_common_genre",
    "oldest_plantation_date",
    "resume_summary",
    "descriptif_summary",
    "name",
]

pipeline = UrbanPipeline([
    # Paris neighbourhoods; each tree gets mapped to one of them.
    ("urban_layer", (
        um.UrbanMapper().urban_layer
        .with_type("region_neighborhoods")
        .from_place("Paris, France")
        .with_mapping(
            longitude_column="longitude",
            latitude_column="latitude",
            output_column=QUARTIER_COLUMN,
        )
        .build()
    )),
    # Pre-processed trees table written by the data-wrangling cell.
    ("loader", (
        um.UrbanMapper().loader
        .from_file("./trees_paris.parquet")
        .with_columns(longitude_column="longitude", latitude_column="latitude")
        .build()
    )),
    ("filter", um.UrbanMapper().filter.with_type("BoundingBoxFilter").build()),
    # Number of remarkable trees per neighbourhood.
    ("enrich_trees_count", (
        um.UrbanMapper().enricher
        .with_data(group_by=QUARTIER_COLUMN)
        .count_by(output_column="ramarquable_trees_count")
        .build()
    )),
    # Numeric averages.
    ("enrich_avg_circonference",
     _aggregate_by_quartier("circonference en cm", "mean", "avg_circonference")),
    ("enrich_avg_hauteur",
     _aggregate_by_quartier("hauteur en m", "mean", "avg_hauteur")),
    # Custom aggregation functions defined earlier in the notebook.
    ("enrich_most_common_genre",
     _aggregate_by_quartier("genre", most_common_genre, "most_common_genre")),
    ("enrich_oldest_plantation",
     _aggregate_by_quartier("date de plantation", oldest_plantation_date, "oldest_plantation_date")),
    # LLM-generated summaries of the free-text columns.
    ("enrich_resume_summary",
     _aggregate_by_quartier("Résumé", summarize_resumes, "resume_summary")),
    ("enrich_description_summary",
     _aggregate_by_quartier("Descriptif", summarize_resumes, "descriptif_summary")),
    ("visualiser", (
        um.UrbanMapper().visual
        .with_type("Interactive")
        .with_style({
            "tiles": "CartoDB dark_matter",
            "tooltip": tooltip_columns,
            "colorbar_text_color": "white",
        })
        .build()
    )),
])
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[3], line 1 ----> 1 from urban_mapper.pipeline import UrbanPipeline 2 import urban_mapper as um 4 pipeline = UrbanPipeline([ 5 ("urban_layer", ( 6 um.UrbanMapper().urban_layer (...) 83 )) 84 ]) File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/__init__.py:3 1 from loguru import logger ----> 3 from .mixins import ( 4 LoaderMixin, 5 EnricherMixin, 6 VisualMixin, 7 TableVisMixin, 8 AuctusSearchMixin, 9 PipelineGeneratorMixin, 10 UrbanPipelineMixin, 11 ) 12 from .modules import ( 13 LoaderBase, 14 CSVLoader, (...) 30 PipelineGeneratorFactory, 31 ) 33 from .urban_mapper import UrbanMapper File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/mixins/__init__.py:1 ----> 1 from .loader import LoaderMixin 2 from .enricher import EnricherMixin 3 from .visual import VisualMixin File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/mixins/loader.py:1 ----> 1 from urban_mapper.modules.loader.loader_factory import LoaderFactory 4 class LoaderMixin(LoaderFactory): 5 def __init__(self): File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/__init__.py:1 ----> 1 from .loader import LoaderBase, CSVLoader, ShapefileLoader, ParquetLoader 2 from .imputer import ( 3 GeoImputerBase, 4 SimpleGeoImputer, 5 AddressGeoImputer, 6 ) 7 from .filter import ( 8 GeoFilterBase, 9 BoundingBoxFilter, 10 ) File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/loader/__init__.py:3 1 from .abc_loader import LoaderBase 2 from .loaders import CSVLoader, ShapefileLoader, ParquetLoader ----> 3 from .loader_factory import LoaderFactory 5 __all__ = [ 6 "LoaderBase", 7 "CSVLoader", (...) 
10 "LoaderFactory", 11 ] File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/loader/loader_factory.py:19 17 from urban_mapper.modules.loader.loaders.csv_loader import CSVLoader 18 from urban_mapper.modules.loader.loaders.parquet_loader import ParquetLoader ---> 19 from urban_mapper.modules.loader.loaders.raster_loader import RasterLoader # Importing RasterLoader of the new raster loader module 20 from urban_mapper.modules.loader.loaders.shapefile_loader import ShapefileLoader 21 from urban_mapper.utils import require_attributes File ~/checkouts/readthedocs.org/user_builds/urbanmapper/checkouts/70/src/urban_mapper/modules/loader/loaders/raster_loader.py:2 1 from ..abc_loader import LoaderBase ----> 2 import rasterio 3 from typing import Any 4 import numpy as np ModuleNotFoundError: No module named 'rasterio'
# Execute the pipeline: run the configured steps and keep both results.
# NOTE(review): presumably `mapped_data` is the tree table mapped to
# neighbourhoods and `enriched_layer` the neighbourhood layer carrying the
# computed metrics — confirm against UrbanPipeline.compose_transform.
mapped_data, enriched_layer = pipeline.compose_transform()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 2 1 # Execute the pipeline ----> 2 mapped_data, enriched_layer = pipeline.compose_transform() NameError: name 'pipeline' is not defined
# Visualise the enriched metrics: one entry per enriched column to render.
metrics_to_plot = [
    "ramarquable_trees_count",
    "avg_circonference",
    "avg_hauteur",
    "most_common_genre",
    "oldest_plantation_date",
    "resume_summary",
    "descriptif_summary",
]
fig = pipeline.visualise(metrics_to_plot)

# Last expression of the cell, so the notebook displays the figure.
fig
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 2 1 # Visualise the enriched metrics ----> 2 fig = pipeline.visualise([ 3 "ramarquable_trees_count", 4 "avg_circonference", 5 "avg_hauteur", 6 "most_common_genre", 7 "oldest_plantation_date", 8 "resume_summary", 9 "descriptif_summary", 10 ]) 12 fig NameError: name 'pipeline' is not defined
# Save the pipeline to a local file so it can be reused later.
# NOTE(review): the ".dill" extension suggests dill serialisation, which would
# include the custom aggregation functions — confirm against UrbanPipeline.save.
pipeline.save("./remarquable_trees_paris.dill")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[6], line 2 1 # Save the pipeline ----> 2 pipeline.save("./remarquable_trees_paris.dill") NameError: name 'pipeline' is not defined
# Export the pipeline to JupyterGIS for collaborative exploration.
# `filepath` is the output file to write; `urban_layer_name` is the name given
# to the exported layer.
pipeline.to_jgis(
    filepath="remarquable_trees_paris_with_llm.JGIS",
    urban_layer_name="Remarquable Trees In paris analysis",
)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[7], line 2 1 # Export the pipeline to JupyterGIS for collaborative exploration ----> 2 pipeline.to_jgis( 3 filepath="remarquable_trees_paris_with_llm.JGIS", 4 urban_layer_name="Remarquable Trees In paris analysis", 5 ) NameError: name 'pipeline' is not defined