Computing POI density, subway distance, and other spatial accessibility metrics.
## Module
from pathlib import Path
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import pandas as pd
## File Paths
tracts_path = Path("data/nyc_tracts_2020/nyc_tracts_2020.shp")
NDVI_dir = Path("data/raster/NDVI.tif")
# 1. Load NYC census tracts shapefile
tracts = gpd.read_file(tracts_path)
# 2. Open NDVI raster and ensure CRS matches the vector layer
with rasterio.open(NDVI_dir) as src:
raster_crs = src.crs
nodata = src.nodata
# Reproject tracts to match raster CRS if necessary
if tracts.crs != raster_crs:
tracts = tracts.to_crs(raster_crs)
ndvi_means = []
# 3. Compute zonal mean NDVI for each tract
for _, row in tracts.iterrows():
geom = [row.geometry]
# Mask raster using the tract polygon
out_image, out_transform = mask(src, geom, crop=True)
data = out_image[0].astype("float32")
# Replace NoData values with NaN to avoid affecting mean
if nodata is not None:
data[data == nodata] = np.nan
# Compute mean NDVI for this tract
mean_ndvi = float(np.nanmean(data))
ndvi_means.append(mean_ndvi)
# 4. Add NDVI results to GeoDataFrame
tracts["NDVI"] = ndvi_means
tracts.head()
| ctlabel | borocode | boroname | ct2020 | boroct2020 | cdeligibil | ntaname | nta2020 | cdta2020 | cdtaname | geoid | shape_leng | shape_area | geometry | NDVI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Manhattan | 000100 | 1000100 | I | The Battery-Governors Island-Ellis Island-Libe... | MN0191 | MN01 | MN01 Financial District-Tribeca (CD 1 Equivalent) | 36061000100 | 10833.043929 | 1.843005e+06 | MULTIPOLYGON (((580787.267 4504805.375, 580819... | 0.024685 |
| 1 | 14.01 | 1 | Manhattan | 001401 | 1001401 | I | Lower East Side | MN0302 | MN03 | MN03 Lower East Side-Chinatown (CD 3 Equivalent) | 36061001401 | 5075.332000 | 1.006117e+06 | POLYGON ((585444.188 4507772.701, 585514.711 4... | 0.074870 |
| 2 | 14.02 | 1 | Manhattan | 001402 | 1001402 | E | Lower East Side | MN0302 | MN03 | MN03 Lower East Side-Chinatown (CD 3 Equivalent) | 36061001402 | 4459.156019 | 1.226206e+06 | POLYGON ((585718.928 4508068.700, 585790.344 4... | 0.046529 |
| 3 | 18 | 1 | Manhattan | 001800 | 1001800 | I | Lower East Side | MN0302 | MN03 | MN03 Lower East Side-Chinatown (CD 3 Equivalent) | 36061001800 | 6391.921174 | 2.399277e+06 | POLYGON ((585313.281 4508223.568, 585324.698 4... | 0.041547 |
| 4 | 22.01 | 1 | Manhattan | 002201 | 1002201 | E | Lower East Side | MN0302 | MN03 | MN03 Lower East Side-Chinatown (CD 3 Equivalent) | 36061002201 | 5779.062607 | 1.740174e+06 | POLYGON ((586251.705 4508169.291, 586248.764 4... | 0.063541 |
## File Paths
water_path = Path("data/Water shp/NYC_water.shp")
water = gpd.read_file(water_path)
tracts = tracts.to_crs(water.crs)
# 1. Compute tract area
tracts["tract_area"] = tracts.geometry.area
# 2. Intersect tracts with water polygons
# This will create pieces of water polygons clipped by tract boundaries
water_in_tracts = gpd.overlay(
tracts[["geoid", "geometry"]],
water[["geometry"]],
how="intersection"
)
# 3. Compute water area inside each tract
water_in_tracts["water_area"] = water_in_tracts.geometry.area
# 4. Aggregate water area by tract (GEOID)
water_area_by_tract = (
water_in_tracts
.groupby("geoid")["water_area"]
.sum()
)
# 5. Map aggregated water area back to the tracts GeoDataFrame
tracts["water_area"] = tracts["geoid"].map(water_area_by_tract).fillna(0)
# 6. Compute Water Coverage Ratio (WCR = water area / tract area)
tracts["WCR"] = tracts["water_area"] / tracts["tract_area"]
# Check the result
tracts[["geoid", "NDVI", "WCR"]].head()
| geoid | NDVI | WCR | |
|---|---|---|---|
| 0 | 36061000100 | 0.024685 | 0.017985 |
| 1 | 36061001401 | 0.074870 | 0.000000 |
| 2 | 36061001402 | 0.046529 | 0.000000 |
| 3 | 36061001800 | 0.041547 | 0.000000 |
| 4 | 36061002201 | 0.063541 | 0.000000 |
building_path = Path("data/Buildings/geo_export_10da9e2c-833d-4ba4-9fe2-1f999ac16759.shp")
buildings = gpd.read_file(building_path)
buildings = buildings.to_crs(water.crs)
buildings.crs
<Projected CRS: EPSG:32618> Name: WGS 84 / UTM zone 18N Axis Info [cartesian]: - E[east]: Easting (metre) - N[north]: Northing (metre) Area of Use: - name: Between 78°W and 72°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Canada - Nunavut; Ontario; Quebec. Colombia. Cuba. Ecuador. Greenland. Haiti. Jamaica. Panama. Turks and Caicos Islands. United States (USA). Venezuela. - bounds: (-78.0, 0.0, -72.0, 84.0) Coordinate Operation: - name: UTM zone 18N - method: Transverse Mercator Datum: World Geodetic System 1984 ensemble - Ellipsoid: WGS 84 - Prime Meridian: Greenwich
# 1. Use height_roo as building height (it is in feet), convert to meters
buildings["bldg_height"] = buildings["height_roo"] * 0.3048
# 2. Compute building footprint area (in m² because CRS is UTM)
buildings["bldg_area"] = buildings.geometry.area
# 3. Compute tract area
tracts["tract_area"] = tracts.geometry.area
# 4. Spatial join — assign buildings to tracts
bldg_in_tracts = gpd.sjoin(
buildings[["bldg_height", "bldg_area", "geometry"]],
tracts[["geoid", "geometry"]],
how="inner",
predicate="intersects"
)
# 5. Weighted height sum for AH
bldg_in_tracts["height_area"] = (
bldg_in_tracts["bldg_height"] * bldg_in_tracts["bldg_area"]
)
# 6. Aggregate by tract
bldg_stats = (
bldg_in_tracts
.groupby("geoid")
.agg(
total_bldg_area=("bldg_area", "sum"),
total_height_area=("height_area", "sum")
)
)
# 7. Join results back
tracts = tracts.join(bldg_stats, on="geoid")
tracts["total_bldg_area"] = tracts["total_bldg_area"].fillna(0)
# 8. BD (building density: footprint area ratio)
tracts["BD"] = tracts["total_bldg_area"] / tracts["tract_area"]
# 9. AH (area-weighted building height)
tracts["AH"] = tracts["total_height_area"] / tracts["total_bldg_area"]
tracts.loc[tracts["total_bldg_area"] == 0, "AH"] = np.nan
# Preview
tracts[["geoid", "BD", "AH"]].head()
| geoid | BD | AH | |
|---|---|---|---|
| 0 | 36061000100 | 0.242965 | 22.344475 |
| 1 | 36061001401 | 0.170096 | 38.401079 |
| 2 | 36061001402 | 0.391586 | 39.186175 |
| 3 | 36061001800 | 0.407032 | 25.738760 |
| 4 | 36061002201 | 0.282754 | 25.412280 |
## Save the reusults
# Select only the variables we need (drop geometry implicitly)
tracts_vars = tracts[["geoid", "BD", "AH", "NDVI", "WCR"]].copy()
tracts_vars.columns = tracts_vars.columns.str.upper()
# Save to CSV (no index column)
tracts_vars.to_csv("data/additional_features/nyc_tracts_new_variables.csv", index=False)
tracts_vars
| geoid | BD | AH | NDVI | WCR | |
|---|---|---|---|---|---|
| 0 | 36061000100 | 0.242965 | 22.344475 | 0.024685 | 0.017985 |
| 1 | 36061001401 | 0.170096 | 38.401079 | 0.074870 | 0.000000 |
| 2 | 36061001402 | 0.391586 | 39.186175 | 0.046529 | 0.000000 |
| 3 | 36061001800 | 0.407032 | 25.738760 | 0.041547 | 0.000000 |
| 4 | 36061002201 | 0.282754 | 25.412280 | 0.063541 | 0.000000 |
| ... | ... | ... | ... | ... | ... |
| 2320 | 36061009903 | 0.395851 | 111.507287 | 0.037452 | 0.057454 |
| 2321 | 36061011700 | 0.412318 | 48.834404 | 0.018437 | 0.024806 |
| 2322 | 36061002601 | 0.359600 | 16.882413 | 0.059028 | 0.000000 |
| 2323 | 36061003200 | 0.299328 | 20.454839 | 0.076770 | 0.000000 |
| 2324 | 36061000202 | 0.194488 | 20.841588 | 0.048946 | 0.001324 |
2325 rows × 5 columns
# Predictor lists.
env_variables = ["TREE_CANOPY_PCT", "IMPERVIOUS_RATIO", "WCR","NDVI"]
acs_variables = ["PCT_BACHELORS_PLUS", "PCT_RENTERS", "PCT_LIMITED_ENGLISH", "MEDIAN_INCOME"]
urban_variables = ["KNN_SUBWAY", "POI_DENSITY", "KNN_PARKS","BD","AH"]
Source: 04_additional_feature.ipynb
Code Cells: 11
Figures: 0