CODE: Additional Feature Engineering

Hot City, Heated Calls:
Understanding Extreme Heat and Quality of Life
Using New York City's 311 and SHAP

Computing POI density, subway distance, and other spatial accessibility metrics.

← 3. Census ACS 5. NLCD Rasters →

NDVI

Code Cell 1
## Module
from pathlib import Path
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import pandas as pd
Code Cell 2
## File Paths
tracts_path = Path("data/nyc_tracts_2020/nyc_tracts_2020.shp")
NDVI_dir = Path("data/raster/NDVI.tif")
Code Cell 3
# 1. Load NYC census tracts shapefile
tracts = gpd.read_file(tracts_path)

# 2. Open NDVI raster and ensure CRS matches the vector layer
with rasterio.open(NDVI_dir) as src:
    raster_crs = src.crs
    nodata = src.nodata

    # Reproject tracts to match raster CRS if necessary
    if tracts.crs != raster_crs:
        tracts = tracts.to_crs(raster_crs)

    ndvi_means = []

    # 3. Compute zonal mean NDVI for each tract
    for _, row in tracts.iterrows():
        geom = [row.geometry]

        # Mask raster using the tract polygon
        out_image, out_transform = mask(src, geom, crop=True)

        data = out_image[0].astype("float32")

        # Replace NoData values with NaN to avoid affecting mean
        if nodata is not None:
            data[data == nodata] = np.nan

        # Compute mean NDVI for this tract
        mean_ndvi = float(np.nanmean(data))
        ndvi_means.append(mean_ndvi)

# 4. Add NDVI results to GeoDataFrame
tracts["NDVI"] = ndvi_means

tracts.head()
ctlabel borocode boroname ct2020 boroct2020 cdeligibil ntaname nta2020 cdta2020 cdtaname geoid shape_leng shape_area geometry NDVI
0 1 1 Manhattan 000100 1000100 I The Battery-Governors Island-Ellis Island-Libe... MN0191 MN01 MN01 Financial District-Tribeca (CD 1 Equivalent) 36061000100 10833.043929 1.843005e+06 MULTIPOLYGON (((580787.267 4504805.375, 580819... 0.024685
1 14.01 1 Manhattan 001401 1001401 I Lower East Side MN0302 MN03 MN03 Lower East Side-Chinatown (CD 3 Equivalent) 36061001401 5075.332000 1.006117e+06 POLYGON ((585444.188 4507772.701, 585514.711 4... 0.074870
2 14.02 1 Manhattan 001402 1001402 E Lower East Side MN0302 MN03 MN03 Lower East Side-Chinatown (CD 3 Equivalent) 36061001402 4459.156019 1.226206e+06 POLYGON ((585718.928 4508068.700, 585790.344 4... 0.046529
3 18 1 Manhattan 001800 1001800 I Lower East Side MN0302 MN03 MN03 Lower East Side-Chinatown (CD 3 Equivalent) 36061001800 6391.921174 2.399277e+06 POLYGON ((585313.281 4508223.568, 585324.698 4... 0.041547
4 22.01 1 Manhattan 002201 1002201 E Lower East Side MN0302 MN03 MN03 Lower East Side-Chinatown (CD 3 Equivalent) 36061002201 5779.062607 1.740174e+06 POLYGON ((586251.705 4508169.291, 586248.764 4... 0.063541

WCR

Code Cell 4
## File Paths
water_path = Path("data/Water shp/NYC_water.shp")
Code Cell 5
water = gpd.read_file(water_path)

tracts = tracts.to_crs(water.crs)
Code Cell 6
# 1. Compute tract area
tracts["tract_area"] = tracts.geometry.area

# 2. Intersect tracts with water polygons
#    This will create pieces of water polygons clipped by tract boundaries
water_in_tracts = gpd.overlay(
    tracts[["geoid", "geometry"]],
    water[["geometry"]],
    how="intersection"
)

# 3. Compute water area inside each tract
water_in_tracts["water_area"] = water_in_tracts.geometry.area

# 4. Aggregate water area by tract (GEOID)
water_area_by_tract = (
    water_in_tracts
    .groupby("geoid")["water_area"]
    .sum()
)

# 5. Map aggregated water area back to the tracts GeoDataFrame
tracts["water_area"] = tracts["geoid"].map(water_area_by_tract).fillna(0)

# 6. Compute Water Coverage Ratio (WCR = water area / tract area)
tracts["WCR"] = tracts["water_area"] / tracts["tract_area"]

# Check the result
tracts[["geoid", "NDVI", "WCR"]].head()
geoid NDVI WCR
0 36061000100 0.024685 0.017985
1 36061001401 0.074870 0.000000
2 36061001402 0.046529 0.000000
3 36061001800 0.041547 0.000000
4 36061002201 0.063541 0.000000

BD and AH

Code Cell 7
building_path = Path("data/Buildings/geo_export_10da9e2c-833d-4ba4-9fe2-1f999ac16759.shp")
buildings = gpd.read_file(building_path)
Code Cell 8
buildings = buildings.to_crs(water.crs)
buildings.crs
<Projected CRS: EPSG:32618>
Name: WGS 84 / UTM zone 18N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 78°W and 72°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Canada - Nunavut; Ontario; Quebec. Colombia. Cuba. Ecuador. Greenland. Haiti. Jamaica. Panama. Turks and Caicos Islands. United States (USA). Venezuela.
- bounds: (-78.0, 0.0, -72.0, 84.0)
Coordinate Operation:
- name: UTM zone 18N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
Code Cell 9
# 1. Use height_roo as building height (it is in feet), convert to meters
buildings["bldg_height"] = buildings["height_roo"] * 0.3048

# 2. Compute building footprint area (in m² because CRS is UTM)
buildings["bldg_area"] = buildings.geometry.area

# 3. Compute tract area
tracts["tract_area"] = tracts.geometry.area

# 4. Spatial join — assign buildings to tracts
bldg_in_tracts = gpd.sjoin(
    buildings[["bldg_height", "bldg_area", "geometry"]],
    tracts[["geoid", "geometry"]],
    how="inner",
    predicate="intersects"
)

# 5. Weighted height sum for AH
bldg_in_tracts["height_area"] = (
    bldg_in_tracts["bldg_height"] * bldg_in_tracts["bldg_area"]
)

# 6. Aggregate by tract
bldg_stats = (
    bldg_in_tracts
    .groupby("geoid")
    .agg(
        total_bldg_area=("bldg_area", "sum"),
        total_height_area=("height_area", "sum")
    )
)

# 7. Join results back
tracts = tracts.join(bldg_stats, on="geoid")

tracts["total_bldg_area"] = tracts["total_bldg_area"].fillna(0)

# 8. BD (building density: footprint area ratio)
tracts["BD"] = tracts["total_bldg_area"] / tracts["tract_area"]

# 9. AH (area-weighted building height)
tracts["AH"] = tracts["total_height_area"] / tracts["total_bldg_area"]
tracts.loc[tracts["total_bldg_area"] == 0, "AH"] = np.nan

# Preview
tracts[["geoid", "BD", "AH"]].head()
geoid BD AH
0 36061000100 0.242965 22.344475
1 36061001401 0.170096 38.401079
2 36061001402 0.391586 39.186175
3 36061001800 0.407032 25.738760
4 36061002201 0.282754 25.412280
Code Cell 10
## Save the reusults
# Select only the variables we need (drop geometry implicitly)
tracts_vars = tracts[["geoid", "BD", "AH", "NDVI", "WCR"]].copy()

tracts_vars.columns = tracts_vars.columns.str.upper()

# Save to CSV (no index column)
tracts_vars.to_csv("data/additional_features/nyc_tracts_new_variables.csv", index=False)

tracts_vars
geoid BD AH NDVI WCR
0 36061000100 0.242965 22.344475 0.024685 0.017985
1 36061001401 0.170096 38.401079 0.074870 0.000000
2 36061001402 0.391586 39.186175 0.046529 0.000000
3 36061001800 0.407032 25.738760 0.041547 0.000000
4 36061002201 0.282754 25.412280 0.063541 0.000000
... ... ... ... ... ...
2320 36061009903 0.395851 111.507287 0.037452 0.057454
2321 36061011700 0.412318 48.834404 0.018437 0.024806
2322 36061002601 0.359600 16.882413 0.059028 0.000000
2323 36061003200 0.299328 20.454839 0.076770 0.000000
2324 36061000202 0.194488 20.841588 0.048946 0.001324

2325 rows × 5 columns

Code Cell 11
# Predictor lists.
env_variables = ["TREE_CANOPY_PCT", "IMPERVIOUS_RATIO", "WCR","NDVI"]
acs_variables = ["PCT_BACHELORS_PLUS", "PCT_RENTERS", "PCT_LIMITED_ENGLISH", "MEDIAN_INCOME"]
urban_variables = ["KNN_SUBWAY", "POI_DENSITY", "KNN_PARKS","BD","AH"]
← 3. Census ACS 5. NLCD Rasters →

Notebooks

This Notebook

Source: 04_additional_feature.ipynb

Code Cells: 11

Figures: 0