public/postgis.md
... ...
@@ -0,0 +1,351 @@
1
+## Overview
2
+
3
+[PostGIS](http://postgis.net/docs/index.html) is the extension to PostgreSQL
4
+that provides for graphical information services objects to be utilized by
5
+the database. (PostGIS is pretty similar to what you get from Oracle Spatial.)
6
+The geocoding is supported by a PostGIS "extra" called
7
+[Tiger Geocoder](http://postgis.net/docs/Extras.html#Tiger_Geocoder) (it
8
+is in fact the only "extra"). [Tiger](https://www.census.gov/geo/maps-data/data/tiger.html)
9
+is a product of the United States Census Bureau.
10
+
11
+For PostgreSQL, the Tiger Geocoder is maintained in two schemas: `tiger`
12
+contains the code and the skeletal data structures. `tiger_data` has the
13
+national and state data which is downloaded from the Census.
14
+
15
+## Version
16
+
17
+Recommended: 10.x
18
+
19
+## Setup
20
+
21
+### Ensure that your PG install is clean; perhaps best to re-install
22
+
23
+#### Boxen
24
+
25
+ # Stop any running PG; may want to do this through boxen-services
26
+ pg_ctl -D /opt/boxen/homebrew/var/postgres stop
27
+
28
+ # Remove the data directory
29
+ rm -fR /opt/boxen/homebrew/var/postgres/*
30
+
31
+ # Remove any old PostGIS and PostgreSQL
32
+ brew unlink postgis
33
+ brew unlink postgresql
34
+ brew uninstall postgis
35
+ brew uninstall postgresql
36
+
37
+#### Brew
38
+
39
+For Iora with ChirpStrap,
40
+
41
+ brew services stop postgresql@9.4
42
+ brew uninstall postgresql@9.4
43
+ rm -fR /usr/local/var/postgresql\@9.4
44
+ brew list | grep postgres # expect see nothing
45
+
46
+### Install PostGIS
47
+
48
+ brew install postgresql
49
+ brew services start postgresql
50
+
51
+ # to verify your install
52
+ psql postgres
53
+ \q
54
+
55
+ # It can be handy to create a database with the name of your user so
56
+ # you can just type `psql`:
57
+ psql -d postgres -c "create database ${USER}"
58
+
59
+ brew install postgis
60
+
61
+### Install the extensions
62
+
63
+For the official instructions, see <http://postgis.net/docs/postgis_installation.html#install_tiger_geocoder_extension>
64
+
65
+ # Define a name for your database
66
+ export DB=icisstaff_development
67
+
68
+ psql -d $DB
69
+ create extension postgis;
70
+ create extension postgis_topology;
71
+ create extension fuzzystrmatch;
72
+ create extension address_standardizer;
73
+ create extension postgis_tiger_geocoder;
74
+
75
+### Verify the address standardizer
76
+
77
+ select num, street, city, state, zip
78
+ from parse_address('1 Devonshire Place, Boston, MA 02109');
79
+
80
+should produce a breakdown of street, city, state, zip:
81
+
82
+| num | street | city | state | zip |
83
+|-----|------------------|--------|-------|-------|
84
+| 1 | Devonshire Place | Boston | MA | 02109 |
85
+
86
+### Should also be able to . . .
87
+
88
+ select na.address, na.streetname,na.streettypeabbrev, na.zip
89
+ from normalize_address('1 Devonshire Place, Boston, MA 02109') as na;
90
+
91
+which produces:
92
+
93
+| address | streetname | streettypeabbrev | zip |
94
+|---------|------------|--------------------|-------|
95
+| 1 | Devonshire | Pl | 02109 |
96
+
97
+## Load from the Census
98
+
99
+### Set up your defaults for generated loader scripts
100
+
101
+Make a directory for your Tiger "staging" data (the raw files downloaded
102
+from the Census):
103
+
104
+ mkdir -p $HOME/gisdata/temp
105
+
106
+When the loader scripts are generated, Tiger consults the
107
+`tiger.loader_variables` and `tiger.loader_platform` tables. First set up
108
+`tiger.loader_variables` so it knows about the "staging" folder:
109
+
110
+ psql -d $DB -c "update tiger.loader_variables set staging_fold = '$HOME/gisdata'"
111
+
112
+Now you will create a row in `tiger.loader_platform` that fits better with
113
+the Brew PostgreSQL setup. The key for these rows is called `os` -- create
114
+one where `os` is the name of the database.
115
+
116
+ psql -d $DB -X -c "delete from tiger.loader_platform where os = '$DB'"
117
+ psql -d $DB <<EOF
118
+ insert into tiger.loader_platform(os, declare_sect, pgbin, wget, unzip_command, psql, path_sep,
119
+ loader, environ_set_command, county_process_command)
120
+ select '$DB',
121
+ E'TMPDIR="\${staging_fold}/temp/"\nUNZIPTOOL=unzip\nWGETTOOL=wget\nPSQL=\'/usr/local/bin/psql -d ${DB} \'\nSHP2PGSQL=shp2pgsql\ncd \${staging_fold}\n',
122
+ pgbin, wget, unzip_command, psql, path_sep,
123
+ loader, environ_set_command, county_process_command
124
+ FROM tiger.loader_platform
125
+ WHERE os = 'sh';
126
+ EOF
127
+
128
+### Generate the scripts to load the geocoder data
129
+
130
+Generate the national data, on which state data depends:
131
+
132
+ psql -d $DB -X -c "SELECT Loader_Generate_Nation_Script('$DB')" -tA > $HOME/gisdata/nation_script_load.sh
133
+
134
+Generate the data for the states you want -- here, Massachusetts and Minnesota:
135
+
136
+ psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['MA', 'MN'], '$DB')" -tA > $HOME/gisdata/ma_mn_states_load.sh
137
+
138
+It is tempting to try to do this for all states . . .
139
+
140
+ psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['AL' , 'AK' , 'AS' , 'AZ' , 'AR' , 'CA' , 'CO' , 'MP' , 'CT' , 'DE' , 'DC' , 'FL' , 'GA' , 'GU' , 'HI' , 'ID' , 'IL' , 'IN' , 'IA' , 'KS' , 'KY' , 'LA' , 'ME' , 'MD' , 'MA' , 'MI' , 'MN' , 'MS' , 'MO' , 'MT' , 'NE' , 'NV' , 'NH' , 'NJ' , 'NM' , 'NY' , 'NC' , 'ND' , 'OH' , 'OK' , 'OR' , 'PA' , 'PR' , 'RI' , 'SC' , 'SD' , 'TN' , 'TX' , 'VI' , 'UT' , 'VT' , 'VA' , 'WA' , 'WV' , 'WI' , 'WY'], '$DB')" -tA > $HOME/gisdata/all_states_load.sh
141
+
142
+But experience suggests that you should load each state separately, so that if
143
+you have to, you can re-run a state:
144
+
145
+ states=(AL AK AS AZ AR CA CO MP CT DE DC FL GA GU HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA PR RI SC SD TN TX VI UT VT VA WA WV WI WY)
146
+ for state in "${states[@]}"
147
+ do
148
+ psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['${state}'], '$DB')" -tA > $HOME/gisdata/${state}_state_load.sh
149
+ done
150
+
151
+Make the scripts runnable . . .
152
+
153
+ chmod +x $HOME/gisdata/*.sh
154
+
155
+### Generate
156
+
157
+Run the loader scripts.
158
+
159
+ cd $HOME/gisdata
160
+ ./nation_script_load.sh
161
+ ./AZ_state_load.sh
162
+ ./MN_state_load.sh
163
+ # etc.
164
+
165
+### Now find lat/long for an address you know in one of the states
166
+
167
+ select
168
+ g.rating,
169
+ st_y(g.geomout) as lat,
170
+ st_x(g.geomout) as lon,
171
+ (addy).address as stno,
172
+ (addy).streetname as street,
173
+ (addy).streettypeabbrev as styp,
174
+ (addy).location as city,
175
+ (addy).stateabbrev as st,
176
+ (addy).zip
177
+ from geocode(normalize_address('931 Portland Ave, Saint Paul, MN'), 1) as g;
178
+
179
+If the data load has run correctly, you should see:
180
+
181
+| rating | lat | lon | stno | street | styp | city | st | zip |
182
+|--------|------------------|-------------------|------|----------|------|---------|----|-------|
183
+| 9 | 44.9429540910341 | -93.1394249137542 | 931 | Portland | Ave | St. Paul| MN | 55104 |
184
+
185
+## Sample queries
186
+
187
+Here are a few ways to leverage the geocoding.
188
+
189
+### Basics
190
+
191
+The key functions are
192
+
193
+* `normalize_address` - <http://postgis.net/docs/Normalize_Address.html> -
194
+ Converts a messy address to a uniform one broken down into its parts;
195
+ can then be used as input to geocode. NOTE: Does not require the downloaded
196
+ Census data.
197
+* `geocode` - <http://postgis.net/docs/Geocode.html> - Converts an address
198
+ to mapping data. Can take as input a string or the result from
199
+ `normalize_address`. One thing I don't discuss here is that it is possible
200
+ to constrain the results with a "geometry filter," so that if the input
201
+ string does not have city and state, you can make the lookup happen within
202
+ some city and state.
203
+ * `st_x` and `st_y` - <http://postgis.net/docs/ST_X.html> - Get latitude or
204
+ longitude from "point" data. (These are actually from postGIS,
205
+ not the Tiger extension.)
206
+* `pprint_addy` - <http://postgis.net/docs/Pprint_Addy.html> - Pretty print
207
+ an address from `normalize_address`. Does not require the downloaded Census data.
208
+* `reverse_geocode` - <http://postgis.net/docs/Reverse_Geocode.html> - Find
209
+ an address for a lat/long location. (Not discussed here.)
210
+* `st_distance_sphere` - <http://postgis.net/docs/manual-1.5/ST_Distance_Sphere.html> - Compute distance as the bird flies. (Not discussed here.)
211
+
212
+The following examples use a sample table like this:
213
+
214
+ drop table if exists people;
215
+ create table people (pid int, first text, last text, address text);
216
+ insert into people (pid, first, last, address) values (1, 'John', 'Norman', '931 Portland, Saint Paul, MN');
217
+ insert into people (pid, first, last, address) values (2, 'Idan', 'Ben-Arieh', '6 valley rd, Natick MA 01760');
218
+ insert into people (pid, first, last, address) values (3, 'Jane', 'Doe', '3048 W San Juan Drive, Tucson AZ 85713');
219
+ insert into people (pid, first, last, address) values (4, 'Doug', 'Doucey', '400 W Girley, Prescott, AZ');
220
+
221
+First off, many of the PostGIS functions return PostgreSQL "composite types":
222
+to destructure, you enclose the returned composite in parentheses, like so:
223
+
224
+ select
225
+ address,
226
+ (normalize_address(address)).streetname,
227
+ (normalize_address(address)).zip
228
+ from people;
229
+
230
+Notice that the `normalize_address` function is being called twice. Ideally
231
+we would like to call it once. A common pattern you will see is some kind
232
+of renamed query result or subquery:
233
+
234
+ select address, (na).streetname, (na).zip from
235
+ (select
236
+ address,
237
+ normalize_address(address) as na
238
+ from people) people_with_normalized_address;
239
+
240
+Another example of destructuring, with geocoding:
241
+
242
+ select
243
+ address,
244
+ st_y((g).geomout) as latitude,
245
+ st_x((g).geomout) as longitude,
246
+ (g).rating,
247
+ pprint_addy((g).addy) as normalized_address
248
+ from
249
+ (select
250
+ address,
251
+ geocode(normalize_address(address),1) as g
252
+ from people) people_with_geocoded_address;
253
+
254
+### Select the original data, augmenting with additional geo columns
255
+
256
+Aside: I imported this data from csv with a neat tool called `pgfutter`: <https://github.com/lukasmartinelli/pgfutter>.
257
+
258
+Here the table with the original data is `import.patients`
259
+with a primary/unique key of `patient_demographics_patient_guid` and the original address in `patient_demographics_address`.
260
+
261
+ select import.patients.*,
262
+ geocoded.latitude as Latitude,
263
+ geocoded.longitude as Longitude,
264
+ geocoded.rating as "Geocode Rating",
265
+ geocoded.normalized_address as "Normalized Address"
266
+ from
267
+ import.patients
268
+ left join
269
+ (select
270
+ patient_demographics_patient_guid,
271
+ st_y((geo).geomout) as latitude,
272
+ st_x((geo).geomout) as longitude,
273
+ (geo).rating,
274
+ pprint_addy((geo).addy) as normalized_address
275
+ from (select
276
+ patient_demographics_patient_guid,
277
+ (geocode(normalize_address(patient_demographics_address),1)) as geo
278
+ from import.patients) addresses) geocoded
279
+ on geocoded.patient_demographics_patient_guid = import.patients.patient_demographics_patient_guid;
280
+
281
+(On my 2014-era MBP, this takes about 0.16 seconds / row; about 015 seconds
282
+with an index on `patient_demographics_patient_guid` -- `create index on import.patients (patient_demographics_patient_guid);`.)
283
+
284
+### Add columns for geocoding, and only geocode when never geocoded
285
+
286
+Finally, in this example, we add columns to a table without geocoding.
287
+Then we only geocode rows where the `rating` column is null. This would be
288
+a nice overnight task to get geocoded all new rows that have never been
289
+geocoded.
290
+
291
+ alter table import.patients
292
+ add column latitude double precision,
293
+ add column longitude double precision,
294
+ add column rating integer,
295
+ add column normalized_address varchar;
296
+
297
+(To get the data types, I used the `pg_typeof` column on a query that
298
+returned the different items.)
299
+
300
+And here's the query (this is verbatim from the documentation).
301
+
302
+Note the use of `index 3` -- this is worth eyeballing before running on all
303
+rows.
304
+
305
+ update import.patients
306
+ set (latitude, longitude, rating, normalized_address)
307
+ = (
308
+ st_y((g.geo).geomout),
309
+ st_x((g.geo).geomout),
310
+ coalesce((g.geo).rating,-1),
311
+ pprint_addy((g.geo).addy)
312
+ )
313
+ from (select patient_demographics_patient_guid
314
+ from import.patients
315
+ where rating is null order by patient_demographics_patient_guid limit 3) as a
316
+ left join (select patient_demographics_patient_guid, (geocode(patient_demographics_address,1)) as geo
317
+ from import.patients As ag
318
+ where ag.rating is null order by patient_demographics_patient_guid limit 3) As g on a.patient_demographics_patient_guid = g.patient_demographics_patient_guid
319
+ where a.patient_demographics_patient_guid = import.patients.patient_demographics_patient_guid;
320
+
321
+(About 0.20 seconds/row without indexing.)
322
+
323
+-----------
324
+
325
+### Visualization
326
+
327
+Here's a quick way to get a visualization on your laptop.
328
+
329
+First, install R.
330
+
331
+ brew tap homebrew/science
332
+ brew install R
333
+
334
+Export your geocoded table to a CSV.
335
+
336
+ psql -c "copy import.patients to '$HOME/Desktop/patients.csv' with (format csv, header)"
337
+
338
+In R,
339
+
340
+ install.packages('ggmap')
341
+ library(ggmap)
342
+ patients <- read.csv(file="/Users/jgn/Desktop/patients.csv", header=TRUE)
343
+
344
+The fastest way to get started with `ggmap` is to use the `qmplot` function.
345
+My advice is to read the docs and play around.
346
+
347
+ qmplot(longitude, latitude, data=patients, source='stamen')
348
+ qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap')
349
+ qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', zoom=10)
350
+ qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', geom = 'density2d')
351
+ qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', alpha=I(0.5), zoom=10, geom='density2d')