CODE: Additional Feature Engineering

Hot City, Heated Calls:
Understanding Extreme Heat and Quality of Life
Using New York City's 311 and SHAP

Computing POI density, subway distance, and other spatial accessibility metrics.

NDVI

Code Cell 1

## Module
from pathlib import Path
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import pandas as pd

Code Cell 2

## File Paths
tracts_path = Path("data/nyc_tracts_2020/nyc_tracts_2020.shp")
NDVI_dir = Path("data/raster/NDVI.tif")

Code Cell 3

# 1. Load NYC census tracts shapefile
tracts = gpd.read_file(tracts_path)

# 2. Open NDVI raster and ensure CRS matches the vector layer
with rasterio.open(NDVI_dir) as src:
    raster_crs = src.crs
    nodata = src.nodata

    # Reproject tracts to match raster CRS if necessary
    if tracts.crs != raster_crs:
        tracts = tracts.to_crs(raster_crs)

    ndvi_means = []

    # 3. Compute zonal mean NDVI for each tract
    for _, row in tracts.iterrows():
        geom = [row.geometry]

        # Mask raster using the tract polygon
        out_image, out_transform = mask(src, geom, crop=True)

        data = out_image[0].astype("float32")

        # Replace NoData values with NaN to avoid affecting mean
        if nodata is not None:
            data[data == nodata] = np.nan

        # Compute mean NDVI for this tract
        mean_ndvi = float(np.nanmean(data))
        ndvi_means.append(mean_ndvi)

# 4. Add NDVI results to GeoDataFrame
tracts["NDVI"] = ndvi_means

tracts.head()

	ctlabel	borocode	boroname	ct2020	boroct2020	cdeligibil	ntaname	nta2020	cdta2020	cdtaname	geoid	shape_leng	shape_area	geometry	NDVI
0	1	1	Manhattan	000100	1000100	I	The Battery-Governors Island-Ellis Island-Libe...	MN0191	MN01	MN01 Financial District-Tribeca (CD 1 Equivalent)	36061000100	10833.043929	1.843005e+06	MULTIPOLYGON (((580787.267 4504805.375, 580819...	0.024685
1	14.01	1	Manhattan	001401	1001401	I	Lower East Side	MN0302	MN03	MN03 Lower East Side-Chinatown (CD 3 Equivalent)	36061001401	5075.332000	1.006117e+06	POLYGON ((585444.188 4507772.701, 585514.711 4...	0.074870
2	14.02	1	Manhattan	001402	1001402	E	Lower East Side	MN0302	MN03	MN03 Lower East Side-Chinatown (CD 3 Equivalent)	36061001402	4459.156019	1.226206e+06	POLYGON ((585718.928 4508068.700, 585790.344 4...	0.046529
3	18	1	Manhattan	001800	1001800	I	Lower East Side	MN0302	MN03	MN03 Lower East Side-Chinatown (CD 3 Equivalent)	36061001800	6391.921174	2.399277e+06	POLYGON ((585313.281 4508223.568, 585324.698 4...	0.041547
4	22.01	1	Manhattan	002201	1002201	E	Lower East Side	MN0302	MN03	MN03 Lower East Side-Chinatown (CD 3 Equivalent)	36061002201	5779.062607	1.740174e+06	POLYGON ((586251.705 4508169.291, 586248.764 4...	0.063541

WCR

Code Cell 4

## File Paths
water_path = Path("data/Water shp/NYC_water.shp")

Code Cell 5

water = gpd.read_file(water_path)

tracts = tracts.to_crs(water.crs)

Code Cell 6

# 1. Compute tract area
tracts["tract_area"] = tracts.geometry.area

# 2. Intersect tracts with water polygons
#    This will create pieces of water polygons clipped by tract boundaries
water_in_tracts = gpd.overlay(
    tracts[["geoid", "geometry"]],
    water[["geometry"]],
    how="intersection"
)

# 3. Compute water area inside each tract
water_in_tracts["water_area"] = water_in_tracts.geometry.area

# 4. Aggregate water area by tract (GEOID)
water_area_by_tract = (
    water_in_tracts
    .groupby("geoid")["water_area"]
    .sum()
)

# 5. Map aggregated water area back to the tracts GeoDataFrame
tracts["water_area"] = tracts["geoid"].map(water_area_by_tract).fillna(0)

# 6. Compute Water Coverage Ratio (WCR = water area / tract area)
tracts["WCR"] = tracts["water_area"] / tracts["tract_area"]

# Check the result
tracts[["geoid", "NDVI", "WCR"]].head()

	geoid	NDVI	WCR
0	36061000100	0.024685	0.017985
1	36061001401	0.074870	0.000000
2	36061001402	0.046529	0.000000
3	36061001800	0.041547	0.000000
4	36061002201	0.063541	0.000000

BD and AH

Code Cell 7

building_path = Path("data/Buildings/geo_export_10da9e2c-833d-4ba4-9fe2-1f999ac16759.shp")
buildings = gpd.read_file(building_path)

Code Cell 8

buildings = buildings.to_crs(water.crs)
buildings.crs

<Projected CRS: EPSG:32618>
Name: WGS 84 / UTM zone 18N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 78°W and 72°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Canada - Nunavut; Ontario; Quebec. Colombia. Cuba. Ecuador. Greenland. Haiti. Jamaica. Panama. Turks and Caicos Islands. United States (USA). Venezuela.
- bounds: (-78.0, 0.0, -72.0, 84.0)
Coordinate Operation:
- name: UTM zone 18N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

Code Cell 9

# 1. Use height_roo as building height (it is in feet), convert to meters
buildings["bldg_height"] = buildings["height_roo"] * 0.3048

# 2. Compute building footprint area (in m² because CRS is UTM)
buildings["bldg_area"] = buildings.geometry.area

# 3. Compute tract area
tracts["tract_area"] = tracts.geometry.area

# 4. Spatial join — assign buildings to tracts
bldg_in_tracts = gpd.sjoin(
    buildings[["bldg_height", "bldg_area", "geometry"]],
    tracts[["geoid", "geometry"]],
    how="inner",
    predicate="intersects"
)

# 5. Weighted height sum for AH
bldg_in_tracts["height_area"] = (
    bldg_in_tracts["bldg_height"] * bldg_in_tracts["bldg_area"]
)

# 6. Aggregate by tract
bldg_stats = (
    bldg_in_tracts
    .groupby("geoid")
    .agg(
        total_bldg_area=("bldg_area", "sum"),
        total_height_area=("height_area", "sum")
    )
)

# 7. Join results back
tracts = tracts.join(bldg_stats, on="geoid")

tracts["total_bldg_area"] = tracts["total_bldg_area"].fillna(0)

# 8. BD (building density: footprint area ratio)
tracts["BD"] = tracts["total_bldg_area"] / tracts["tract_area"]

# 9. AH (area-weighted building height)
tracts["AH"] = tracts["total_height_area"] / tracts["total_bldg_area"]
tracts.loc[tracts["total_bldg_area"] == 0, "AH"] = np.nan

# Preview
tracts[["geoid", "BD", "AH"]].head()

	geoid	BD	AH
0	36061000100	0.242965	22.344475
1	36061001401	0.170096	38.401079
2	36061001402	0.391586	39.186175
3	36061001800	0.407032	25.738760
4	36061002201	0.282754	25.412280

Code Cell 10

## Save the reusults
# Select only the variables we need (drop geometry implicitly)
tracts_vars = tracts[["geoid", "BD", "AH", "NDVI", "WCR"]].copy()

tracts_vars.columns = tracts_vars.columns.str.upper()

# Save to CSV (no index column)
tracts_vars.to_csv("data/additional_features/nyc_tracts_new_variables.csv", index=False)

tracts_vars

	geoid	BD	AH	NDVI	WCR
0	36061000100	0.242965	22.344475	0.024685	0.017985
1	36061001401	0.170096	38.401079	0.074870	0.000000
2	36061001402	0.391586	39.186175	0.046529	0.000000
3	36061001800	0.407032	25.738760	0.041547	0.000000
4	36061002201	0.282754	25.412280	0.063541	0.000000
...	...	...	...	...	...
2320	36061009903	0.395851	111.507287	0.037452	0.057454
2321	36061011700	0.412318	48.834404	0.018437	0.024806
2322	36061002601	0.359600	16.882413	0.059028	0.000000
2323	36061003200	0.299328	20.454839	0.076770	0.000000
2324	36061000202	0.194488	20.841588	0.048946	0.001324

2325 rows × 5 columns

Code Cell 11

# Predictor lists.
env_variables = ["TREE_CANOPY_PCT", "IMPERVIOUS_RATIO", "WCR","NDVI"]
acs_variables = ["PCT_BACHELORS_PLUS", "PCT_RENTERS", "PCT_LIMITED_ENGLISH", "MEDIAN_INCOME"]
urban_variables = ["KNN_SUBWAY", "POI_DENSITY", "KNN_PARKS","BD","AH"]

← 3. Census ACS 5. NLCD Rasters →

CODE: Additional Feature Engineering

Hot City, Heated Calls:
Understanding Extreme Heat and Quality of Life
Using New York City's 311 and SHAP

NDVI

WCR

BD and AH

Notebooks

This Notebook

Data Pipeline

Weather → 311 → Census → Features → Merge → EDA → Modeling

CODE: Additional Feature Engineering

Hot City, Heated Calls:Understanding Extreme Heat and Quality of LifeUsing New York City's 311 and SHAP

NDVI

WCR

BD and AH

Notebooks

This Notebook

Data Pipeline

Weather → 311 → Census → Features → Merge → EDA → Modeling

Hot City, Heated Calls:
Understanding Extreme Heat and Quality of Life
Using New York City's 311 and SHAP