Automated-GIS-ML-System-for-Water-Infrastructure-Risk-in-Kenya/Spatial SQL.sql at main · frankraDIUM/Automated-GIS-ML-System-for-Water-Infrastructure-Risk-in-Kenya · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
-- Spatial SQL – Service area coverage
-- 1. Create or replace a function to get population within buffer for a given point
CREATE OR REPLACE FUNCTION get_population_served(point_geom geometry, buffer_meters double precision)
RETURNS double precision AS $$
DECLARE
  pop_sum double precision;
BEGIN
  SELECT COALESCE(SUM((ST_SummaryStats(ST_Clip(rast, ST_Buffer(point_geom, buffer_meters / 111320.0)))) .sum), 0)
  INTO pop_sum
  FROM population_raster
  WHERE ST_Intersects(rast, ST_Buffer(point_geom, buffer_meters / 111320.0));

  RETURN pop_sum;
END;
$$ LANGUAGE plpgsql;


-- 2. Add the column to the full table
ALTER TABLE water_points
ADD COLUMN IF NOT EXISTS pop_served_500m double precision;

-- 3. Populate the column (this may take a while — 21k+ points, each calling ST_Buffer + ST_Clip + ST_SummaryStats)
-- You can run this in batches if it takes too long (e.g. LIMIT 5000 first to test)
UPDATE water_points w
SET pop_served_500m = get_population_served(w.geometry, 500)
WHERE pop_served_500m IS NULL;   -- only update missing values

-- Checks
SELECT COUNT(*) AS updated_count
FROM water_points
WHERE pop_served_500m IS NOT NULL;

SELECT wpdx_id, status_clean, pop_served_500m
FROM water_points
ORDER BY pop_served_500m DESC
LIMIT 5;


-- 4. Underserved zones & clustering
-- Run clustering on underserved points (full Kenya)
DROP TABLE IF EXISTS non_functional_points;

CREATE TABLE non_functional_points AS
SELECT
    wpdx_id,
    status_clean,
    lat_deg,
    lon_deg,
    pop_served_500m,
    install_year,
    geometry
FROM water_points
WHERE status_clean NOT IN ('Functional', 'Functional, not in use')
ORDER BY pop_served_500m ASC;

CREATE INDEX IF NOT EXISTS idx_non_functional_geom
ON non_functional_points USING GIST (geometry);

SELECT COUNT(*) AS non_functional_count FROM non_functional_points;


-- 5. Clustering
-- Add cluster column
ALTER TABLE non_functional_points
ADD COLUMN IF NOT EXISTS cluster_id integer;

-- Cluster (1 km eps, min 5 points)
-- Perform the clustering and update in one go
WITH clusters AS (
    SELECT
        ctid,  -- system column to uniquely identify rows (important!)
        ST_ClusterDBSCAN(geometry, eps := 0.009, minpoints := 5) OVER () AS computed_cluster
    FROM non_functional_points
)
UPDATE non_functional_points n
SET cluster_id = c.computed_cluster
FROM clusters c
WHERE n.ctid = c.ctid;
);

-- View top clusters
SELECT
    cluster_id,
    COUNT(*) AS cluster_size,
    AVG(pop_served_500m) AS avg_pop_served,
    STRING_AGG(DISTINCT status_clean, ', ') AS status_types,
    ST_AsText(ST_Centroid(ST_Collect(geometry))) AS cluster_center
FROM non_functional_points
WHERE cluster_id IS NOT NULL
GROUP BY cluster_id
ORDER BY cluster_size DESC
LIMIT 10;


-- How many points were clustered vs noise/outliers
SELECT
    CASE WHEN cluster_id IS NULL THEN 'Noise/Outliers' ELSE 'Clustered' END AS type,
    COUNT(*) AS count
FROM non_functional_points
GROUP BY type;


-- Export clustered points (with cluster_id)
SELECT json_build_object(
    'type', 'FeatureCollection',
    'features', json_agg(
        json_build_object(
            'type', 'Feature',
            'geometry', ST_AsGeoJSON(geometry)::json,
            'properties', json_build_object(
                'wpdx_id', wpdx_id,
                'status_clean', status_clean,
                'pop_served_500m', pop_served_500m,
                'install_year', install_year,
                'cluster_id', cluster_id
            )
        )
    )
) AS geojson
FROM non_functional_points
WHERE cluster_id IS NOT NULL;


-- Export cluster centroids (one point per cluster with size & avg pop)
SELECT json_build_object(
    'type', 'FeatureCollection',
    'features', json_agg(
        json_build_object(
            'type', 'Feature',
            'geometry', ST_AsGeoJSON(centroid)::json,
            'properties', json_build_object(
                'cluster_id', cluster_id,
                'size', cluster_size,
                'avg_pop_served', avg_pop_served
            )
        )
    )
) AS geojson
FROM (
    SELECT
        cluster_id,
        COUNT(*) AS cluster_size,
        ROUND(AVG(pop_served_500m)::numeric, 0) AS avg_pop_served,
        ST_Centroid(ST_Collect(geometry)) AS centroid
    FROM non_functional_points
    WHERE cluster_id IS NOT NULL
    GROUP BY cluster_id
) AS sub;


-- 6. ML preparation & modeling

COPY (
    SELECT
        w.wpdx_id,
        w.lat_deg,
        w.lon_deg,
        w.status_clean,
        w.pop_served_500m,
        w.install_year,
        w.is_urban,
        w.water_source_clean,
        w.water_tech_clean,
        w.geometry,
        n.cluster_id
    FROM water_points w
    LEFT JOIN non_functional_points n ON w.wpdx_id = n.wpdx_id
) TO 'I:\GEO DATA ANALYSIS\Kenya Water\nairobi_water_project\water_points_ml_with_clusters.csv'
WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',');


COPY (
    SELECT
        w.wpdx_id,
        w.lat_deg,
        w.lon_deg,
        w.status_clean,
        w.pop_served_500m,
        w.install_year,
        w.is_urban,
        w.water_source_clean,
        w.water_tech_clean,
        ST_AsText(w.geometry) AS geom_wkt,
        n.cluster_id
    FROM water_points w
    LEFT JOIN non_functional_points n ON w.wpdx_id = n.wpdx_id
) TO 'I:\GEO DATA ANALYSIS\Kenya Water\nairobi_water_project\water_points_ml_clean.csv'
WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',');