CODE: NYC 311 Data Processing

Hot City, Heated Calls:
Understanding Extreme Heat and Quality of Life
Using New York City's 311 and SHAP

Processing 311 service requests and aggregating QoL complaints by census tract.

← 1b. Heat Classification 3. Census ACS →

311 QUALITY OF LIFE BY TRACTS

Extract quality-of-life-related 311 reports in NYC.

Code Cell 1
# Modules.
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import requests, time
from pathlib import Path
Code Cell 2
# Directory layout for the 311 pipeline.
nyc_311_dir = Path("data/nyc_311")     # root for all 311 artifacts
nyc_311 = nyc_311_dir / "311_raw"      # raw API pulls live here
nyc_311.mkdir(parents=True, exist_ok=True)

# Input: NYC 2020 census tract boundaries shapefile.
tracts_path = Path("data/nyc_tracts_2020/nyc_tracts_2020.shp")

# Outputs: raw pull (CSV), tract-by-day panel (CSV), point-level GeoJSON.
raw_path = nyc_311 / "nyc_311_summer_2025.csv"
panel_path = nyc_311_dir / "nyc_311_tract_day_2025.csv"
point_path_geojson = nyc_311_dir / "nyc_311_points_2025.geojson"
Code Cell 3
# Complaint-type vocabularies for the five QoL superclasses. Each string must
# match the 311 `complaint_type` field after upper-casing and stripping
# (codes in parentheses, e.g. "(NR10)", are part of the official type names).

# Noise and social activity (heat-sensitive).
QOL_NOISE = [
    "LOUD MUSIC/PARTY",
    "BANGING/POUNDING",
    "LOUD TALKING",
    "CAR/TRUCK MUSIC",
    "CAR/TRUCK HORN",
    "DOG NOISE",
    "NOISE: BOAT(ENGINE,MUSIC,ETC) (NR10)",
    "NOISE: ALARMS (NR3)",
    "NOISE: AIR CONDITION/VENTILATION EQUIPMENT (NV1)",
    "NOISE: CONSTRUCTION BEFORE/AFTER HOURS (NM1)",
    "NOISE: JACK HAMMERING (NC2)",
    "NOISE, BARKING DOG (NR5)",
    "NOISE: MANUFACTURING NOISE (NK1)",
    "NOISE: OTHER NOISE SOURCES (USE COMMENTS) (NZZ)",
]

# Outdoor activity / public-space use.
QOL_OUTDOOR = [
    "BLOCKED HYDRANT",
    "BLOCKED SIDEWALK",
    "BLOCKED BIKE LANE",
    "ILLEGAL PARKING",
    "DOUBLE PARKED BLOCKING TRAFFIC",
    "BLOCKED CROSSWALK",
    "DERELICT VEHICLES",
    "CONGESTION/GRIDLOCK",
    "GRAFFITI",
    "CHRONIC DUMPING",
    "COMMERCIAL OVERNIGHT PARKING",
]

# Sanitation, trash, and pests.
QOL_SANITATION = [
    "GARBAGE OR LITTER",
    "TRASH",
    "OVERFLOWING",
    "RAT SIGHTING",
    "MOUSE SIGHTING",
    "CONDITION ATTRACTING RODENTS",
    "PESTS",
    "UNSANITARY CONDITION",
    "DEAD ANIMAL",
    "WASTE DISPOSAL",
    "DOG WASTE",
]

# Water infrastructure and hydrants.
QOL_WATER = [
    "WATER LEAK",
    "WATER SUPPLY",
    "HYDRANT LEAKING (WC1)",
    "HYDRANT RUNNING FULL (WA4)",
    "HYDRANT RUNNING (WC3)",
    "HYDRANT DEFECTIVE (WC2)",
    "SEWER",
    "SEWER ODOR (SA2)",
    "SEWER BACKUP (SA)",
    "LEAK (USE COMMENTS) (WA2)",
]

# Infrastructure under heat stress (power, cooling, signals, lighting).
QOL_INFRA_HEAT = [
    "POWER OUTAGE",
    "ELECTRICAL/GAS RANGE",
    "VENTILATION SYSTEM",
    "TRAFFIC SIGNAL LIGHT",
    "STREET LIGHT OUT",
    "STREET LIGHT LAMP MISSING",
    "STREET LIGHT CYCLING",
]
Code Cell 4
# Build lookup dictionary for mapping.
def build_qol_lookup():
    mapping = {}
    for c in QOL_NOISE: mapping[c] = "QOL_NOISE"
    for c in QOL_OUTDOOR: mapping[c] = "QOL_OUTDOOR"
    for c in QOL_SANITATION: mapping[c] = "QOL_SANITATION"
    for c in QOL_WATER: mapping[c] = "QOL_WATER_INFRA"
    for c in QOL_INFRA_HEAT: mapping[c] = "QOL_INFRA_HEAT"
    return mapping

QOL_LOOKUP = build_qol_lookup()
Code Cell 5
# Download helper.
def download_311_jfk_2025(token = None):
    base = "https://data.cityofnewyork.us/resource/erm2-nwe9.json"
    headers = {"X-App-Token": token} if token else {}
    limit = 50000

    start = "2025-06-01T00:00:00"
    end = "2025-08-22T23:59:59"

    where_clause = (
        f"created_date between '{start}' and '{end}' "
        "AND latitude IS NOT NULL AND longitude IS NOT NULL"
    )

    cols = [
        "unique_key", "created_date", "complaint_type",
        "descriptor", "latitude", "longitude", "borough"
    ]

    offset = 0
    frames = []

    while True:
        params = {
            "$select": ",".join(cols),
            "$where": where_clause,
            "$limit": limit,
            "$offset": offset,
            "$order": "created_date"
        }
        r = requests.get(base, params = params, headers = headers)
        data = r.json()
        if len(data) == 0:
            break

        frames.append(pd.DataFrame(data))
        print("Fetched:", len(data), "offset:", offset)
        offset += limit
        time.sleep(0.3)

    df = pd.concat(frames, ignore_index = True)
    df.to_csv(raw_path, index = False)

    print("Saved:", raw_path)
    return df
Code Cell 6
# Download, then coerce types: Socrata returns every field as a JSON string.
calls_311 = download_311_jfk_2025(token = None)
calls_311["created_date"] = pd.to_datetime(calls_311["created_date"], errors = "coerce")
# errors="coerce" turns any malformed coordinate into NaN instead of raising,
# consistent with the tolerant parse used for created_date above.
calls_311["latitude"] = pd.to_numeric(calls_311["latitude"], errors = "coerce")
calls_311["longitude"] = pd.to_numeric(calls_311["longitude"], errors = "coerce")
Fetched: 50000 offset: 0
Fetched: 50000 offset: 50000
Fetched: 50000 offset: 100000
Fetched: 50000 offset: 150000
Fetched: 50000 offset: 200000
Fetched: 50000 offset: 250000
Fetched: 50000 offset: 300000
Fetched: 50000 offset: 350000
Fetched: 50000 offset: 400000
Fetched: 50000 offset: 450000
Fetched: 50000 offset: 500000
Fetched: 50000 offset: 550000
Fetched: 50000 offset: 600000
Fetched: 50000 offset: 650000
Fetched: 26191 offset: 700000
Saved: data\nyc_311\311_raw\nyc_311_summer_2025.csv
Code Cell 7
# Spatial join to tracts.
gdf_tracts = gpd.read_file(tracts_path)

# Keep only tracts in the five NYC county FIPS prefixes
# (Bronx, Kings, New York, Queens, Richmond).
nyc_prefixes = ("36005", "36047", "36061", "36081", "36085")
gdf_tracts = gdf_tracts[gdf_tracts["geoid"].str.startswith(nyc_prefixes)].copy()

# Vectorized point construction (replaces a per-row Point list comprehension),
# then reproject lon/lat (EPSG:4326) into the tract CRS before joining.
gdf_311 = gpd.GeoDataFrame(
    calls_311,
    geometry = gpd.points_from_xy(calls_311.longitude, calls_311.latitude),
    crs = "EPSG:4326"
).to_crs(gdf_tracts.crs)

# Left join keeps every call; points outside any tract get geoid = NaN.
joined_gdf = gpd.sjoin(
    gdf_311,
    gdf_tracts[["geoid","geometry"]],
    how = "left",
    predicate = "within"
)

# Drop calls that fell outside all NYC tracts, then align the key name
# with the census-style GEOID used downstream.
joined_gdf = joined_gdf.dropna(subset = ["geoid"]).copy()
joined_gdf.rename(columns = {"geoid":"GEOID"}, inplace = True)
Code Cell 8
# Encode to QoL superclasses.
# Normalize complaint_type so it matches the upper-case lookup keys exactly.
joined_gdf["ct_norm"] = joined_gdf["complaint_type"].str.upper().str.strip()
joined_gdf["qol_category"] = joined_gdf["ct_norm"].map(QOL_LOOKUP)

# Heat-relevant QoL call = any complaint that mapped to a superclass.
joined_gdf["is_heat_qol"] = joined_gdf["qol_category"].notna()

# Build the tract-by-day panel.
joined_gdf["date"] = joined_gdf["created_date"].dt.date

panel = (
    joined_gdf.groupby(["GEOID","date"], as_index = False)
    .agg(
        total_calls = ("unique_key", "count"),
        qol_calls = ("is_heat_qol", "sum"),
    )
)

# NOTE(review): despite the "_1k" suffix this is just the raw QoL count cast
# to float — no per-1k-population scaling happens here. Column name kept for
# downstream compatibility; confirm intent against later notebooks.
panel["heat_qol_rate_1k"] = (panel["qol_calls"].astype(float))

# Share of the tract-day's calls that are QoL. total_calls is a group count,
# so it is always >= 1; the guard is kept for safety.
panel["qol_pct"] = np.where(
    panel["total_calls"] > 0,
    panel["qol_calls"]/panel["total_calls"],
    np.nan
)

# Upper-case column names match the saved-CSV schema shown below.
panel.columns = panel.columns.str.upper()
Code Cell 9
# Persist the tract-by-day panel to CSV (path from Cell 2), then display it.
panel.to_csv(panel_path, index = False)
print("Saved panel:", panel_path)

panel
Saved panel: data/nyc_311/panel/nyc_311_tract_day_2025.csv
GEOID DATE TOTAL_CALLS QOL_CALLS HEAT_QOL_RATE_1K QOL_PCT
0 36005000100 2025-06-30 1 0 0.0 0.000
1 36005000100 2025-07-23 1 0 0.0 0.000
2 36005000100 2025-08-04 1 0 0.0 0.000
3 36005000200 2025-06-01 8 5 5.0 0.625
4 36005000200 2025-06-02 3 0 0.0 0.000
... ... ... ... ... ... ...
162339 36085032300 2025-08-15 1 0 0.0 0.000
162340 36085032300 2025-08-18 2 1 1.0 0.500
162341 36085032300 2025-08-19 4 4 4.0 1.000
162342 36085032300 2025-08-20 2 0 0.0 0.000
162343 36085032300 2025-08-21 1 0 0.0 0.000

162344 rows × 6 columns

Code Cell 10
# Save point aggregated data joined_gdf with GEOID assignment.
point_cols = [
    "unique_key", "created_date", "complaint_type", "descriptor",
    "latitude", "longitude", "borough", "GEOID", 
    "ct_norm", "qol_flag", "date", "geometry"
]

# Ensure only the necessary columns are kept and the index is dropped for clean output.
point_data_gdf = joined_gdf[point_cols].copy()

# Save GeoDataFrame to GeoJSON file.
point_data_gdf.to_file(point_path_geojson, driver = "GeoJSON")

print("Saved point data as GeoJSON:", point_path_geojson)
Saved point data as GeoJSON: data\nyc_311\nyc_311_points_2025.geojson
← 1b. Heat Classification 3. Census ACS →

Notebooks

This Notebook

Source: 02_311_tract_daily.ipynb

Code Cells: 10

Figures: 0