{
  "version": "2026-03-26",
  "generated_at": "2026-03-27T02:47:25.438772+00:00",
  "canonical_files": [
    "scripts/etl_cities.py",
    "scripts/load_city_data.sql",
    "scripts/load_trees_data.sql",
    "scripts/qc_validate.py",
    "backend/migrations/20241212000000_init_db.sql",
    "backend/migrations/20241212120000_add_carbon_benefit.sql",
    "backend/migrations/20250126000000_add_species_attributes.sql",
    "backend/migrations/20260315000000_add_city_column.sql",
    "backend/migrations/20260321000000_add_provenance_fields.sql",
    "backend/src/repository.rs"
  ],
  "database": {
    "table": "trees",
    "documented_scope": "Tree inventory lineage and transformations currently represented in the Rangifer repository.",
    "normalized_csv_columns": [
      "city",
      "species",
      "common_name",
      "borough",
      "diameter",
      "latitude",
      "longitude",
      "carbon_kg",
      "total_annual_benefit_cad",
      "is_invasive",
      "is_native",
      "species_status",
      "planting_year",
      "age",
      "growth_rate_cm_yr",
      "confidence",
      "source",
      "method",
      "dataset_version"
    ],
    "legacy_montreal_csv_columns": [
      "inv_type",
      "emp_no",
      "borough",
      "common_name",
      "species",
      "diameter",
      "latitude",
      "longitude",
      "carbon_kg",
      "total_annual_benefit_cad",
      "is_invasive",
      "is_native",
      "species_status",
      "planting_year",
      "age",
      "growth_rate_cm_yr"
    ],
    "validation_constants": {
      "diameter_min_cm": 0.5,
      "diameter_max_cm": 300,
      "age_max_years": 300,
      "growth_rate_max_cm_yr": 5.0,
      "carbon_max_kg": 50000,
      "age_diameter_rule": "If diameter_cm > age * 5, treat age as implausible and null age, planting_year, and growth_rate.",
      "carbon_coefficients": {
        "hardwood": {
          "b0": -2.0773,
          "b1": 2.3323
        },
        "softwood": {
          "b0": -2.5356,
          "b1": 2.4349
        }
      },
      "annual_benefit_model": {
        "annual_carbon_seq_kg_cap": 25.0,
        "carbon_value_per_kg_cad": 0.05,
        "other_benefits_formula": "(diameter_cm / 100.0) ** 1.5 * 50.0"
      }
    },
    "load_filters": [
      {
        "rule": "Latitude and longitude must both be present before insert.",
        "reason": "The trees table stores geometry, so rows without coordinates are rejected.",
        "implemented_in": [
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      },
      {
        "rule": "Coordinates must fall inside the Canada-focused load envelope.",
        "reason": "Rejects obviously malformed points before geometry creation.",
        "implemented_in": [
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      },
      {
        "rule": "Diameter must be null or between 0.5 and 300 cm.",
        "reason": "Blocks implausible DBH values and measurement errors.",
        "implemented_in": [
          "scripts/etl_cities.py::validate_row",
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      },
      {
        "rule": "Carbon must be null or between 0 and 50000 kg.",
        "reason": "Rejects obviously invalid biomass-derived carbon values before persistence.",
        "implemented_in": [
          "scripts/etl_cities.py::validate_row",
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      },
      {
        "rule": "Confidence must be null or between 0 and 1.",
        "reason": "Keeps provenance confidence scores bounded before persistence.",
        "implemented_in": [
          "scripts/load_city_data.sql"
        ]
      },
      {
        "rule": "Planting year is cast to integer only when present and plausible.",
        "reason": "Prevents malformed source dates from turning into misleading ages.",
        "implemented_in": [
          "scripts/etl_cities.py",
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      },
      {
        "rule": "Boolean flags are normalized from 1 or 0 into true or false.",
        "reason": "Keeps the database schema typed even though ETL output is CSV.",
        "implemented_in": [
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      }
    ],
    "frontend_contract": {
      "primary_pages": [
        "/data",
        "/methodology"
      ],
      "public_json_path": "/tree-documentation.json",
      "recommended_sections": [
        "Database field ledger",
        "City normalization notes",
        "Validation and audit controls",
        "Known gaps and documentation boundaries"
      ],
      "snapshot_file": "rangifer_fe/src/data/content/tree-documentation.json"
    },
    "known_gaps": [
      {
        "title": "Montreal upstream lineage is only partially captured in repo",
        "impact": "The current repo can document the SQL load into trees, but not every upstream transformation that created arbres-publics-final-analysis.csv.",
        "recommended_next_step": "Move the Montreal pre-processing logic into versioned ETL code or preserve a field-level manifest alongside the generated CSV."
      },
      {
        "title": "Height and condition are legacy-only fields not populated by the current city ETL path",
        "impact": "These fields remain in the schema and API but are not populated by scripts/etl_cities.py.",
        "recommended_next_step": "Either document them as Montreal-only legacy fields, or extend city ETL coverage before exposing them as cross-city metrics."
      },
      {
        "title": "Victoria is a product priority but not part of the current tree ETL script",
        "impact": "Documentation generated from the current tree pipeline does not yet cover Victoria row transformations.",
        "recommended_next_step": "Add Victoria to scripts/etl_cities.py before expanding the public tree-table methodology beyond the current six-city scope."
      }
    ]
  },
  "pipeline_steps": [
    {
      "step": "Extract",
      "summary": "Download or load municipal tree inventories into city-specific ETL functions."
    },
    {
      "step": "Normalize",
      "summary": "Map heterogeneous source fields into a unified CSV schema with WGS84 coordinates and centimeter diameters."
    },
    {
      "step": "Derive",
      "summary": "Compute age, carbon, annual benefit, growth rate, and species classification fields."
    },
    {
      "step": "Validate",
      "summary": "Clean implausible values in Python, then enforce additional range checks during SQL load."
    },
    {
      "step": "Publish",
      "summary": "Persist rows to PostgreSQL/PostGIS and surface the documented fields through API, MCP, and frontend pages."
    }
  ],
  "normalized_input_fields": [
    {
      "field": "city",
      "type": "text",
      "description": "Normalized municipality label written by the ETL before load."
    },
    {
      "field": "species",
      "type": "text",
      "description": "Best available scientific or municipal species identifier after normalization."
    },
    {
      "field": "common_name",
      "type": "text",
      "description": "Display-friendly common name when the source provides one."
    },
    {
      "field": "borough",
      "type": "text",
      "description": "Ward, borough, neighbourhood, or equivalent local administrative area."
    },
    {
      "field": "diameter",
      "type": "float",
      "description": "Diameter at breast height in centimeters after unit normalization."
    },
    {
      "field": "latitude",
      "type": "float",
      "description": "WGS84 latitude used to build the PostGIS point."
    },
    {
      "field": "longitude",
      "type": "float",
      "description": "WGS84 longitude used to build the PostGIS point."
    },
    {
      "field": "carbon_kg",
      "type": "float",
      "description": "Derived carbon storage estimate before SQL load filters."
    },
    {
      "field": "total_annual_benefit_cad",
      "type": "float",
      "description": "Derived annual benefit estimate before SQL load filters."
    },
    {
      "field": "is_invasive",
      "type": "int",
      "description": "0 or 1 flag that becomes a boolean in PostgreSQL."
    },
    {
      "field": "is_native",
      "type": "int",
      "description": "0 or 1 flag that becomes a boolean in PostgreSQL."
    },
    {
      "field": "species_status",
      "type": "text",
      "description": "Derived status label: Native, Invasive, or Other."
    },
    {
      "field": "planting_year",
      "type": "float",
      "description": "Source year value before integer casting during load."
    },
    {
      "field": "age",
      "type": "float",
      "description": "Derived age when planting year is available and plausible."
    },
    {
      "field": "growth_rate_cm_yr",
      "type": "float",
      "description": "Derived annual diameter growth estimate when age is available."
    },
    {
      "field": "confidence",
      "type": "float",
      "description": "Row-level confidence score derived from record completeness after ETL validation."
    },
    {
      "field": "source",
      "type": "text",
      "description": "Dataset-specific source label assigned by the ETL before load."
    },
    {
      "field": "method",
      "type": "text",
      "description": "Collection method assigned during ETL or legacy SQL load."
    },
    {
      "field": "dataset_version",
      "type": "text",
      "description": "Source snapshot or legacy load vintage assigned before insert."
    }
  ],
  "legacy_montreal_input_fields": [
    {
      "field": "inv_type",
      "type": "text",
      "description": "Legacy Montreal inventory type column preserved in the precomputed CSV."
    },
    {
      "field": "emp_no",
      "type": "text",
      "description": "Legacy Montreal identifier column preserved in the precomputed CSV."
    },
    {
      "field": "borough",
      "type": "text",
      "description": "Montreal borough label from the precomputed CSV."
    },
    {
      "field": "common_name",
      "type": "text",
      "description": "Montreal common name from the precomputed CSV."
    },
    {
      "field": "species",
      "type": "text",
      "description": "Montreal species value from the precomputed CSV."
    },
    {
      "field": "diameter",
      "type": "float",
      "description": "Montreal diameter in centimeters from the precomputed CSV."
    },
    {
      "field": "latitude",
      "type": "float",
      "description": "Montreal latitude used to build PostGIS geometry."
    },
    {
      "field": "longitude",
      "type": "float",
      "description": "Montreal longitude used to build PostGIS geometry."
    },
    {
      "field": "carbon_kg",
      "type": "float",
      "description": "Precomputed Montreal carbon estimate loaded directly into trees."
    },
    {
      "field": "total_annual_benefit_cad",
      "type": "float",
      "description": "Precomputed Montreal annual benefit estimate loaded directly into trees."
    },
    {
      "field": "is_invasive",
      "type": "int",
      "description": "Legacy Montreal 0 or 1 invasive flag."
    },
    {
      "field": "is_native",
      "type": "int",
      "description": "Legacy Montreal 0 or 1 native flag."
    },
    {
      "field": "species_status",
      "type": "text",
      "description": "Legacy Montreal status label loaded directly into trees."
    },
    {
      "field": "planting_year",
      "type": "float",
      "description": "Legacy Montreal planting year prior to integer cast."
    },
    {
      "field": "age",
      "type": "float",
      "description": "Legacy Montreal age value loaded directly into trees."
    },
    {
      "field": "growth_rate_cm_yr",
      "type": "float",
      "description": "Legacy Montreal growth rate value loaded directly into trees."
    }
  ],
  "database_fields": [
    {
      "field": "id",
      "label": "Record ID",
      "category": "system",
      "frontend_visible": false,
      "audit_status": "auditable_in_repo",
      "description": "Surrogate primary key assigned by PostgreSQL during insert.",
      "source_fields": [],
      "transformation_summary": "Generated by SERIAL sequence. No source-side transformation.",
      "formula_or_sql": "id SERIAL PRIMARY KEY",
      "null_behavior": "Never null.",
      "validation_rules": [
        "Enforced by primary key constraint."
      ],
      "implementation_refs": [
        "backend/migrations/20241212000000_init_db.sql"
      ]
    },
    {
      "field": "city",
      "label": "City",
      "category": "normalized_source",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Municipality label normalized by ETL or backfilled to Montreal for legacy rows.",
      "source_fields": [
        "city"
      ],
      "transformation_summary": "Written directly by city ETL output. Legacy Montreal rows were backfilled in migration 20260315000000_add_city_column.sql.",
      "formula_or_sql": "UPDATE trees SET city = 'Montreal' WHERE city IS NULL",
      "null_behavior": "Not null after migration backfill.",
      "validation_rules": [
        "City ETL assigns a fixed city label per source dataset.",
        "Database migration sets existing null cities to Montreal."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py",
        "scripts/load_city_data.sql",
        "backend/migrations/20260315000000_add_city_column.sql"
      ]
    },
    {
      "field": "species",
      "label": "Species",
      "category": "normalized_source",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Best available botanical or municipal species identifier after city-specific cleanup.",
      "source_fields": [
        "species",
        "botanical_name",
        "genus_name",
        "species_name",
        "cultivar_name",
        "common_name"
      ],
      "transformation_summary": "Species strings are normalized per city. When a true botanical name is missing, the ETL falls back to the municipality's species identifier or cleaned common name. SQL load coalesces missing values to 'Unknown'.",
      "formula_or_sql": "COALESCE(species, 'Unknown')",
      "null_behavior": "Missing source values are coerced to 'Unknown' at load time.",
      "validation_rules": [
        "Species is required by the database schema.",
        "Per-city ETL logic reconstructs Vancouver and Calgary cultivar names."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_toronto",
        "scripts/etl_cities.py::etl_vancouver",
        "scripts/etl_cities.py::etl_calgary",
        "scripts/load_city_data.sql"
      ]
    },
    {
      "field": "common_name",
      "label": "Common Name",
      "category": "normalized_source",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Human-readable municipal common name when present.",
      "source_fields": [
        "common_name",
        "species_common",
        "COMMON_NAME"
      ],
      "transformation_summary": "Names are title-cased in several city ETLs. Toronto common names are reordered from 'Maple, Norway' into 'Norway Maple' when no botanical name exists.",
      "formula_or_sql": "Stored as provided by ETL.",
      "null_behavior": "Remains null when no common name exists.",
      "validation_rules": [
        "No standalone database constraint; quality depends on municipal source completeness."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_toronto",
        "scripts/etl_cities.py::etl_vancouver",
        "scripts/etl_cities.py::etl_edmonton",
        "scripts/etl_cities.py::etl_calgary",
        "scripts/etl_cities.py::etl_winnipeg"
      ]
    },
    {
      "field": "borough",
      "label": "Borough or Ward",
      "category": "normalized_source",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Normalized local administrative area such as borough, ward, neighbourhood, or community code.",
      "source_fields": [
        "borough",
        "ward",
        "site",
        "neighbourhood_name",
        "comm_code",
        "neighbourhood"
      ],
      "transformation_summary": "Mapped per city from the best available neighbourhood or ward field. Toronto converts ward decimals into labels like 'Ward 3'.",
      "formula_or_sql": "Stored as provided by ETL.",
      "null_behavior": "Can remain null if the source has no local area field.",
      "validation_rules": [
        "Used heavily in API filters and analytics; indexes exist on city and borough combinations."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py",
        "backend/migrations/20260315000000_add_city_column.sql",
        "backend/src/repository.rs"
      ]
    },
    {
      "field": "planting_year",
      "label": "Planting Year",
      "category": "normalized_source",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Planting year when it can be reliably extracted from municipal source dates.",
      "source_fields": [
        "plant_date",
        "planted_date",
        "date_planted",
        "active_dt"
      ],
      "transformation_summary": "City ETLs extract the year from raw dates, reject implausible years, and SQL load casts surviving values to integer.",
      "formula_or_sql": "CASE WHEN planting_year IS NOT NULL THEN planting_year::INTEGER ELSE NULL END",
      "null_behavior": "Set to null when missing or outside plausible range.",
      "validation_rules": [
        "Only years between 1800 and the current runtime year are retained by ETL.",
        "Age/diameter mismatch can nullify planting_year during validation."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::validate_row",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql"
      ]
    },
    {
      "field": "age",
      "label": "Age",
      "category": "derived",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Derived tree age in years when planting year is known and plausible.",
      "source_fields": [
        "planting_year",
        "diameter"
      ],
      "transformation_summary": "Age is currently computed only as current_year - planting_year. The ETL does not infer age from diameter when planting year is missing.",
      "formula_or_sql": "age = current_year - planting_year",
      "null_behavior": "Null when planting year is missing, implausible, or invalidated by cross-checks.",
      "validation_rules": [
        "Age must be between 0 and 300 years.",
        "If diameter > age * 5, age is nulled and diameter is kept."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::validate_row",
        "scripts/etl_cities.py",
        "scripts/load_city_data.sql"
      ]
    },
    {
      "field": "height",
      "label": "Height",
      "category": "legacy_partial",
      "frontend_visible": true,
      "audit_status": "partial_in_repo",
      "description": "Legacy height field that remains in the schema but is not populated by the current multi-city ETL path.",
      "source_fields": [],
      "transformation_summary": "Current city ETL does not write height. Existing non-null values come from legacy Montreal data or future loaders.",
      "formula_or_sql": "No active transformation in scripts/etl_cities.py or load_city_data.sql.",
      "null_behavior": "Typically null for multi-city ETL rows.",
      "validation_rules": [
        "No current ETL validation path in repo for height."
      ],
      "implementation_refs": [
        "backend/migrations/20241212000000_init_db.sql",
        "backend/src/repository.rs"
      ]
    },
    {
      "field": "diameter",
      "label": "Diameter",
      "category": "normalized_source",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Diameter at breast height in centimeters after unit cleanup.",
      "source_fields": [
        "diameter",
        "dbh_trunk",
        "diameter_cm",
        "diameter_breast_height",
        "dbh_cm",
        "diameter at breast height"
      ],
      "transformation_summary": "City ETLs normalize DBH into centimeters. Vancouver converts inches to centimeters when only inch-based diameter is available.",
      "formula_or_sql": "diameter_cm = round(diameter_inches * 2.54, 1) when needed",
      "null_behavior": "Set to null when out of plausible range.",
      "validation_rules": [
        "Diameter must be between 0.5 and 300 cm when present.",
        "Invalid diameter nullifies carbon, benefit, and growth_rate during ETL validation."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::validate_row",
        "scripts/etl_cities.py::etl_vancouver",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql"
      ]
    },
    {
      "field": "condition",
      "label": "Condition",
      "category": "legacy_partial",
      "frontend_visible": true,
      "audit_status": "partial_in_repo",
      "description": "Legacy condition field retained in the schema but not populated by the current multi-city ETL loader.",
      "source_fields": [],
      "transformation_summary": "Current city ETL does not output condition, so existing values are legacy-only.",
      "formula_or_sql": "No active transformation in scripts/etl_cities.py or load_city_data.sql.",
      "null_behavior": "Typically null for current non-Montreal city loads.",
      "validation_rules": [
        "No active ETL validation path in repo for condition."
      ],
      "implementation_refs": [
        "backend/migrations/20241212000000_init_db.sql",
        "backend/src/repository.rs"
      ]
    },
    {
      "field": "carbon_kg",
      "label": "Carbon Storage",
      "category": "derived",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Estimated carbon storage in kilograms using a general allometric biomass model.",
      "source_fields": [
        "diameter",
        "species"
      ],
      "transformation_summary": "The ETL chooses a softwood or hardwood coefficient pair from the species genus, computes above-ground biomass from ln(DBH), then stores carbon as biomass * 0.5.",
      "formula_or_sql": "Softwood genera use b0=-2.5356, b1=2.4349; hardwood genera use b0=-2.0773, b1=2.3323. AGB = exp(b0 + b1 * ln(DBH_cm)); carbon_kg = round(AGB * 0.5, 2)",
      "null_behavior": "Null when diameter is missing, non-positive, outside range, or yields overflow.",
      "validation_rules": [
        "Carbon is recalculated when diameter survives but derived values were nulled.",
        "Carbon above 50000 kg is rejected and nulled."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::estimate_carbon_kg",
        "scripts/etl_cities.py::validate_row",
        "backend/migrations/20241212120000_add_carbon_benefit.sql"
      ]
    },
    {
      "field": "total_annual_benefit_cad",
      "label": "Annual Benefit",
      "category": "derived",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Rough annual ecosystem benefit estimate in CAD.",
      "source_fields": [
        "diameter",
        "carbon_kg"
      ],
      "transformation_summary": "The ETL uses a simplified proxy formula: annual carbon value plus a size-based benefit curve for stormwater, air quality, and energy effects.",
      "formula_or_sql": "annual_carbon_seq = min(diameter_cm * 0.3, 25.0); carbon_value = annual_carbon_seq * 0.05; other_benefits = (diameter_cm / 100.0) ** 1.5 * 50.0; total = round(carbon_value + other_benefits, 2)",
      "null_behavior": "Null when diameter is invalid or carbon is invalidated.",
      "validation_rules": [
        "Annual benefit is nulled whenever diameter or carbon fails validation."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::estimate_annual_benefit_cad",
        "scripts/etl_cities.py::validate_row",
        "backend/migrations/20241212120000_add_carbon_benefit.sql"
      ]
    },
    {
      "field": "location",
      "label": "Location",
      "category": "derived_geometry",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "PostGIS point in EPSG:4326 built from normalized longitude and latitude.",
      "source_fields": [
        "longitude",
        "latitude"
      ],
      "transformation_summary": "SQL load converts normalized WGS84 coordinates into geometry using ST_MakePoint and ST_SetSRID.",
      "formula_or_sql": "ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)",
      "null_behavior": "Rows without coordinates are rejected before insert.",
      "validation_rules": [
        "Per-city ETLs reject coordinates outside city bounds before CSV write.",
        "Load SQL enforces a Canada-focused coordinate envelope before geometry creation."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql"
      ]
    },
    {
      "field": "created_at",
      "label": "Created At",
      "category": "system",
      "frontend_visible": false,
      "audit_status": "auditable_in_repo",
      "description": "Insert timestamp assigned by PostgreSQL.",
      "source_fields": [],
      "transformation_summary": "Database default uses CURRENT_TIMESTAMP at insert time.",
      "formula_or_sql": "created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP",
      "null_behavior": "Never null unless explicitly overridden.",
      "validation_rules": [
        "Controlled by database default."
      ],
      "implementation_refs": [
        "backend/migrations/20241212000000_init_db.sql"
      ]
    },
    {
      "field": "is_native",
      "label": "Native Flag",
      "category": "derived_classification",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Boolean native flag based on a static Canada-focused allowlist plus limited genus fallback.",
      "source_fields": [
        "species"
      ],
      "transformation_summary": "classify_species() compares the lowercased species string to NATIVE_SPECIES_CANADA, then falls back to matching the genus token for a small subset.",
      "formula_or_sql": "CASE WHEN is_native = 1 THEN TRUE WHEN is_native = 0 THEN FALSE ELSE NULL END",
      "null_behavior": "Becomes false in ETL output when not matched to the allowlist.",
      "validation_rules": [
        "Classification is deterministic from the static allowlist embedded in etl_cities.py."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::classify_species",
        "scripts/load_city_data.sql",
        "backend/migrations/20250126000000_add_species_attributes.sql"
      ]
    },
    {
      "field": "is_invasive",
      "label": "Invasive Flag",
      "category": "derived_classification",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Boolean invasive flag based on a static Canada-focused invasive species set.",
      "source_fields": [
        "species"
      ],
      "transformation_summary": "classify_species() compares the lowercased species string to INVASIVE_SPECIES_CANADA.",
      "formula_or_sql": "CASE WHEN is_invasive = 1 THEN TRUE WHEN is_invasive = 0 THEN FALSE ELSE NULL END",
      "null_behavior": "Becomes false in ETL output when not matched to the invasive set.",
      "validation_rules": [
        "Classification is deterministic from the static invasive set embedded in etl_cities.py."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::classify_species",
        "scripts/load_city_data.sql",
        "backend/migrations/20250126000000_add_species_attributes.sql"
      ]
    },
    {
      "field": "species_status",
      "label": "Species Status",
      "category": "derived_classification",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Three-way label derived from native and invasive matching.",
      "source_fields": [
        "species",
        "is_native",
        "is_invasive"
      ],
      "transformation_summary": "Native wins first, invasive wins second, otherwise status is Other.",
      "formula_or_sql": "if is_native: Native; elif is_invasive: Invasive; else: Other",
      "null_behavior": "Always populated for current city ETL rows.",
      "validation_rules": [
        "The label is derived from classify_species() and stored in the CSV before load."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::classify_species",
        "scripts/load_city_data.sql",
        "backend/migrations/20250126000000_add_species_attributes.sql"
      ]
    },
    {
      "field": "growth_rate_cm_yr",
      "label": "Growth Rate",
      "category": "derived",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Estimated annual diameter growth rate in centimeters per year.",
      "source_fields": [
        "diameter",
        "age"
      ],
      "transformation_summary": "The ETL computes growth rate as diameter / age when both are present and plausible.",
      "formula_or_sql": "growth_rate_cm_yr = round(diameter_cm / age, 2)",
      "null_behavior": "Null when age is unavailable, invalid, or implies an implausibly fast growth rate.",
      "validation_rules": [
        "Growth rate above 5.0 cm/year is nulled.",
        "If age is invalidated, growth rate is nulled as well."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::estimate_growth_rate",
        "scripts/etl_cities.py::validate_row",
        "backend/migrations/20250126000000_add_species_attributes.sql"
      ]
    },
    {
      "field": "confidence",
      "label": "Confidence",
      "category": "provenance",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Record confidence score between 0 and 1, intended to reflect how directly observed the row is; currently assigned from a row-completeness heuristic.",
      "source_fields": [
        "species",
        "common_name",
        "borough",
        "diameter",
        "planting_year"
      ],
      "transformation_summary": "Current city ETL assigns confidence with a row-completeness heuristic after validation. The legacy Montreal load stamps an explicit lower confidence band based on how complete the precomputed CSV row is.",
      "formula_or_sql": "scripts/etl_cities.py::estimate_row_confidence(...) or CASE-based confidence in scripts/load_trees_data.sql",
      "null_behavior": "Current city ETL rows and the legacy Montreal loader both populate confidence explicitly.",
      "validation_rules": [
        "The SQL load rejects confidence values outside the 0 to 1 range.",
        "Rows with missing scientific names, diameter, borough, common name, or planting year are down-weighted by the ETL heuristic."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::estimate_row_confidence",
        "scripts/etl_cities.py::build_provenance",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql",
        "backend/migrations/20260321000000_add_provenance_fields.sql",
        "backend/src/repository.rs"
      ]
    },
    {
      "field": "source",
      "label": "Source",
      "category": "provenance",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Originating dataset label for citation and filtering.",
      "source_fields": [
        "city"
      ],
      "transformation_summary": "Current city ETL maps each city to a dataset-specific source label before export. The legacy Montreal load stamps the precomputed CSV name explicitly during SQL insert.",
      "formula_or_sql": "scripts/etl_cities.py::CITY_PROVENANCE[city]['source'] or 'Montreal Public Trees legacy analysis CSV'",
      "null_behavior": "Current city ETL rows and the legacy Montreal loader both populate source explicitly.",
      "validation_rules": [
        "Current city ETL uses a fixed per-city source mapping so source values stay deterministic."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::CITY_PROVENANCE",
        "scripts/etl_cities.py::build_provenance",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql",
        "backend/migrations/20260321000000_add_provenance_fields.sql",
        "backend/src/repository.rs"
      ]
    },
    {
      "field": "method",
      "label": "Method",
      "category": "provenance",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Collection method enum intended to distinguish survey, lidar, imagery, or citizen observation records.",
      "source_fields": [
        "city"
      ],
      "transformation_summary": "Current city ETL assigns the collection method per dataset before export. The legacy Montreal load also stamps method explicitly during insert.",
      "formula_or_sql": "scripts/etl_cities.py::CITY_PROVENANCE[city]['method'] or 'field_survey'",
      "null_behavior": "Current city ETL rows and the legacy Montreal loader both populate method explicitly.",
      "validation_rules": [
        "Current tree ETL paths use field_survey for the municipal inventory datasets represented in repo."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::CITY_PROVENANCE",
        "scripts/etl_cities.py::build_provenance",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql",
        "backend/migrations/20260321000000_add_provenance_fields.sql",
        "backend/src/repository.rs"
      ]
    },
    {
      "field": "dataset_version",
      "label": "Dataset Version",
      "category": "provenance",
      "frontend_visible": true,
      "audit_status": "auditable_in_repo",
      "description": "Version marker for the source snapshot or load vintage.",
      "source_fields": [],
      "transformation_summary": "Current city ETL stamps each row with the ETL snapshot date. The legacy Montreal load uses a fixed legacy-precomputed label so that the old import path is explicit rather than implicit.",
      "formula_or_sql": "f'snapshot-<YYYY-MM-DD>' in scripts/etl_cities.py or 'legacy-precomputed' in scripts/load_trees_data.sql",
      "null_behavior": "Current city ETL rows and the legacy Montreal loader both populate dataset_version explicitly.",
      "validation_rules": [
        "Current city ETL derives the snapshot label from the ETL runtime date."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::DATASET_SNAPSHOT_DATE",
        "scripts/etl_cities.py::build_provenance",
        "scripts/load_city_data.sql",
        "scripts/load_trees_data.sql",
        "backend/migrations/20260321000000_add_provenance_fields.sql",
        "backend/src/repository.rs"
      ]
    }
  ],
  "city_transformations": [
    {
      "city": "Montreal",
      "status": "legacy_precomputed",
      "source_name": "arbres-publics-final-analysis.csv",
      "source_endpoint": null,
      "lineage_boundary": "The repo captures the SQL load into trees, but the upstream transformations that produced arbres-publics-final-analysis.csv are not fully represented in the current codebase.",
      "raw_column_mapping": {
        "species": [
          "species"
        ],
        "common_name": [
          "common_name"
        ],
        "borough": [
          "borough"
        ],
        "diameter": [
          "diameter"
        ],
        "coordinates": [
          "latitude",
          "longitude"
        ],
        "planting_year": [
          "planting_year"
        ],
        "derived_metrics": [
          "carbon_kg",
          "total_annual_benefit_cad",
          "age",
          "growth_rate_cm_yr",
          "is_native",
          "is_invasive",
          "species_status"
        ]
      },
      "coordinate_logic": "Coordinates are assumed to already be normalized to latitude and longitude columns before load.",
      "species_logic": "Species, common_name, borough, and derived metrics arrive precomputed in the CSV copied by scripts/load_data.sh.",
      "diameter_logic": "Diameter is loaded from the precomputed CSV with SQL range filtering.",
      "borough_logic": "Borough is loaded directly from the precomputed CSV.",
      "planting_year_logic": "Loaded directly from the precomputed CSV, then cast to integer.",
      "skip_rules": [
        "Rows without latitude or longitude are rejected in load_trees_data.sql.",
        "Rows with invalid coordinate, diameter, or carbon values are rejected in load_trees_data.sql."
      ],
      "implementation_refs": [
        "scripts/load_data.sh",
        "scripts/load_trees_data.sql"
      ]
    },
    {
      "city": "Toronto",
      "status": "auditable_in_repo",
      "source_name": "Toronto Street Tree Data",
      "source_endpoint": "https://open.toronto.ca/dataset/street-tree-data/",
      "lineage_boundary": "Fully auditable from download through normalized CSV output.",
      "raw_column_mapping": {
        "species": [
          "botanical_name",
          "common_name"
        ],
        "common_name": [
          "common_name"
        ],
        "borough": [
          "ward",
          "site"
        ],
        "diameter": [
          "dbh_trunk",
          "dbh"
        ],
        "coordinates": [
          "latitude",
          "longitude",
          "y",
          "x",
          "geometry"
        ],
        "planting_year": [
          "plant_date",
          "planted_date"
        ]
      },
      "coordinate_logic": "Reads latitude and longitude directly when available, otherwise parses the geometry string emitted by Toronto's export format.",
      "species_logic": "Uses botanical_name when present. If absent, common names like 'Maple, Norway' are reordered into 'Norway Maple' and reused as the species identifier.",
      "diameter_logic": "Reads DBH_TRUNK directly in centimeters.",
      "borough_logic": "Converts ward decimals into labels like 'Ward 3'; falls back to site when ward is missing.",
      "planting_year_logic": "Extracts year from plant_date or planted_date and handles YYYYMMDD-like numbers.",
      "skip_rules": [
        "Missing coordinates.",
        "Coordinates outside Toronto bounds."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_toronto"
      ]
    },
    {
      "city": "Vancouver",
      "status": "auditable_in_repo",
      "source_name": "Vancouver Public Trees",
      "source_endpoint": "https://opendata.vancouver.ca/explore/dataset/public-trees/",
      "lineage_boundary": "Fully auditable from download through normalized CSV output.",
      "raw_column_mapping": {
        "species": [
          "genus_name",
          "species_name",
          "cultivar_name"
        ],
        "common_name": [
          "common_name"
        ],
        "borough": [
          "neighbourhood_name",
          "neighborhood_name",
          "geo_local_area"
        ],
        "diameter": [
          "diameter_cm",
          "diameter"
        ],
        "coordinates": [
          "geo_point_2d",
          "latitude",
          "longitude"
        ],
        "planting_year": [
          "date_planted"
        ]
      },
      "coordinate_logic": "Parses geo_point_2d as 'lat, lon' and falls back to latitude and longitude columns.",
      "species_logic": "Reconstructs species from genus, species, and cultivar, while dropping placeholder cultivar values such as NONE.",
      "diameter_logic": "Uses diameter_cm when present; otherwise converts diameter in inches to centimeters.",
      "borough_logic": "Uses neighbourhood_name, neighborhood_name, or geo_local_area in order.",
      "planting_year_logic": "Reads the leading year from the YYYYMMDD date_planted field.",
      "skip_rules": [
        "Missing coordinates.",
        "Coordinates outside Vancouver bounds."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_vancouver"
      ]
    },
    {
      "city": "Edmonton",
      "status": "auditable_in_repo",
      "source_name": "Edmonton Tree Inventory",
      "source_endpoint": "https://data.edmonton.ca/",
      "lineage_boundary": "Fully auditable from download through normalized CSV output.",
      "raw_column_mapping": {
        "species": [
          "species_botanical"
        ],
        "common_name": [
          "species_common"
        ],
        "borough": [
          "neighbourhood_name"
        ],
        "diameter": [
          "diameter_breast_height"
        ],
        "coordinates": [
          "latitude",
          "longitude"
        ],
        "planting_year": [
          "planted_date"
        ]
      },
      "coordinate_logic": "Reads latitude and longitude directly from the Socrata export.",
      "species_logic": "Uses species_botanical and title-cases species_common.",
      "diameter_logic": "Reads diameter_breast_height directly in centimeters.",
      "borough_logic": "Uses neighbourhood_name and title-cases the result.",
      "planting_year_logic": "Reads the first four digits from planted_date values like '1990/06/01'.",
      "skip_rules": [
        "Missing coordinates.",
        "Coordinates outside Edmonton bounds."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_edmonton"
      ]
    },
    {
      "city": "Calgary",
      "status": "auditable_in_repo",
      "source_name": "Calgary Tree Inventory",
      "source_endpoint": "https://data.calgary.ca/",
      "lineage_boundary": "Fully auditable from download through normalized CSV output.",
      "raw_column_mapping": {
        "species": [
          "genus",
          "species",
          "cultivar"
        ],
        "common_name": [
          "common_name"
        ],
        "borough": [
          "comm_code"
        ],
        "diameter": [
          "dbh_cm"
        ],
        "coordinates": [
          "point"
        ],
        "planting_year": [
          "active_dt"
        ]
      },
      "coordinate_logic": "Parses WKT-like strings such as POINT (-114.0978 51.0786) from the point column.",
      "species_logic": "Reconstructs species from genus, species, and cultivar fields, then applies botanical-style casing.",
      "diameter_logic": "Reads dbh_cm directly in centimeters.",
      "borough_logic": "Uses comm_code as the best available local area label.",
      "planting_year_logic": "Reads the first four digits from active_dt values like '1962/08/01'.",
      "skip_rules": [
        "Missing coordinates.",
        "Coordinates outside Calgary bounds.",
        "Rows whose asset_type is STUMP."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_calgary"
      ]
    },
    {
      "city": "Winnipeg",
      "status": "auditable_in_repo",
      "source_name": "Winnipeg Tree Inventory",
      "source_endpoint": "https://data.winnipeg.ca/",
      "lineage_boundary": "Fully auditable from download through normalized CSV output.",
      "raw_column_mapping": {
        "species": [
          "botanical name"
        ],
        "common_name": [
          "common name"
        ],
        "borough": [
          "neighbourhood"
        ],
        "diameter": [
          "diameter at breast height"
        ],
        "coordinates": [
          "point"
        ],
        "planting_year": []
      },
      "coordinate_logic": "Parses WKT-like strings such as POINT (-97.04 49.89) from the point column.",
      "species_logic": "Uses the botanical name directly and applies botanical-style casing.",
      "diameter_logic": "Reads 'diameter at breast height' directly in centimeters.",
      "borough_logic": "Uses the neighbourhood field and title-cases the result.",
      "planting_year_logic": "No planting year is currently extracted in this ETL path.",
      "skip_rules": [
        "Missing coordinates.",
        "Coordinates outside Winnipeg bounds."
      ],
      "implementation_refs": [
        "scripts/etl_cities.py::etl_winnipeg"
      ]
    }
  ],
  "validation": {
    "etl_runtime_checks": [
      {
        "name": "Row-level plausibility cleaning",
        "summary": "validate_row() nulls invalid diameter, age, growth, and carbon values instead of dropping the whole record.",
        "implemented_in": [
          "scripts/etl_cities.py::validate_row"
        ]
      },
      {
        "name": "Per-city coordinate bounds",
        "summary": "Each city ETL rejects coordinates outside its municipal envelope before CSV export.",
        "implemented_in": [
          "scripts/etl_cities.py"
        ]
      },
      {
        "name": "Load-time range enforcement",
        "summary": "SQL load scripts reject rows that still fail coordinate, diameter, or carbon sanity checks.",
        "implemented_in": [
          "scripts/load_city_data.sql",
          "scripts/load_trees_data.sql"
        ]
      }
    ],
    "artifact_qc_checks": [
      {
        "name": "Dataset output QC",
        "summary": "qc_validate.py checks feature counts, file size, schema, PMTiles layer names, and broad Canada bounding boxes for generated layers.",
        "implemented_in": [
          "scripts/qc_validate.py",
          "scripts/qc_baseline.json"
        ]
      }
    ],
    "documentation_validation_checks": [
      {
        "name": "Manifest-to-code coverage",
        "summary": "validate_tree_documentation.py confirms documented normalized columns and load targets match ETL and SQL definitions.",
        "implemented_in": [
          "scripts/validate_tree_documentation.py"
        ]
      },
      {
        "name": "Deterministic transformation assertions",
        "summary": "validate_tree_documentation.py exercises species classification, geometry parsing, carbon estimation, and row-cleaning scenarios.",
        "implemented_in": [
          "scripts/validate_tree_documentation.py"
        ]
      }
    ],
    "manual_audit_protocol": [
      "Sample 5 to 10 records per city from the raw municipal export.",
      "Trace those rows through the normalized CSV and confirm species, borough, diameter, planting_year, and coordinates survive as documented.",
      "Recompute carbon_kg and growth_rate_cm_yr for the sample and compare to stored values.",
      "Confirm provenance fields are populated and match the documented ETL or SQL load rules."
    ]
  }
}
