d517277ad96c5b7d9627d19b98e465e32bfe583c
public/postgis.md
... | ... | @@ -0,0 +1,351 @@ |
1 | +## Overview |
|
2 | + |
|
3 | +[PostGIS](http://postgis.net/docs/index.html) is the extension to PostgreSQL |
|
4 | +that provides for graphical information services objects to be utilized by |
|
5 | +the database. (PostGIS is pretty similar to what you get from Oracle Spatial.) |
|
6 | +The geocoding is supported by a PostGIS "extra" called |
|
7 | +[Tiger Geocoder](http://postgis.net/docs/Extras.html#Tiger_Geocoder) (it |
|
8 | +is in fact the only "extra"). [Tiger](https://www.census.gov/geo/maps-data/data/tiger.html) |
|
9 | +is a product of the United States Census Bureau. |
|
10 | + |
|
11 | +For PostgreSQL, the Tiger Geocoder is maintained in two schemas: `tiger` |
|
12 | +contains the code and the skeletal data structures. `tiger_data` has the |
|
13 | +national and state data which is downloaded from the Census. |
|
14 | + |
|
15 | +## Version |
|
16 | + |
|
17 | +Recommended: 10.x |
|
18 | + |
|
19 | +## Setup |
|
20 | + |
|
21 | +### Ensure that your PG install is clean; perhaps best to re-install |
|
22 | + |
|
23 | +#### Boxen |
|
24 | + |
|
25 | + # Stop any running PG; may want to do this through boxen-services |
|
26 | + pg_ctl -D /opt/boxen/homebrew/var/postgres stop |
|
27 | + |
|
28 | + # Remove the data directory |
|
29 | + rm -fR /opt/boxen/homebrew/var/postgres/* |
|
30 | + |
|
31 | + # Remove any old PostGIS and PostgreSQL |
|
32 | + brew unlink postgis |
|
33 | + brew unlink postgresql |
|
34 | + brew uninstall postgis |
|
35 | + brew uninstall postgresql |
|
36 | + |
|
37 | +#### Brew |
|
38 | + |
|
39 | +For Iora with ChirpStrap, |
|
40 | + |
|
41 | + brew services stop postgresql@9.4 |
|
42 | + brew uninstall postgresql@9.4 |
|
43 | + rm -fR /usr/local/var/postgresql\@9.4 |
|
44 | + brew list | grep postgres # expect see nothing |
|
45 | + |
|
46 | +### Install PostGIS |
|
47 | + |
|
48 | + brew install postgresql |
|
49 | + brew services start postgresql |
|
50 | + |
|
51 | + # to verify your install |
|
52 | + psql postgres |
|
53 | + \q |
|
54 | + |
|
55 | + # It can be handy to create a database with the name of your user so |
|
56 | + # you can just type `psql`: |
|
57 | + psql -d postgres -c "create database ${USER}" |
|
58 | + |
|
59 | + brew install postgis |
|
60 | + |
|
61 | +### Install the extensions |
|
62 | + |
|
63 | +For the official instructions, see <http://postgis.net/docs/postgis_installation.html#install_tiger_geocoder_extension> |
|
64 | + |
|
65 | + # Define a name for your database |
|
66 | + export DB=icisstaff_development |
|
67 | + |
|
68 | + psql -d $DB |
|
69 | + create extension postgis; |
|
70 | + create extension postgis_topology; |
|
71 | + create extension fuzzystrmatch; |
|
72 | + create extension address_standardizer; |
|
73 | + create extension postgis_tiger_geocoder; |
|
74 | + |
|
75 | +### Verify the address standardizer |
|
76 | + |
|
77 | + select num, street, city, state, zip |
|
78 | + from parse_address('1 Devonshire Place, Boston, MA 02109'); |
|
79 | + |
|
80 | +should produce a breakdown of street, city, state, zip: |
|
81 | + |
|
82 | +| num | street | city | state | zip | |
|
83 | +|-----|------------------|--------|-------|-------| |
|
84 | +| 1 | Devonshire Place | Boston | MA | 02109 | |
|
85 | + |
|
86 | +### Should also be able to . . . |
|
87 | + |
|
88 | + select na.address, na.streetname,na.streettypeabbrev, na.zip |
|
89 | + from normalize_address('1 Devonshire Place, Boston, MA 02109') as na; |
|
90 | + |
|
91 | +which produces: |
|
92 | + |
|
93 | +| address | streetname | streettypeabbrev | zip | |
|
94 | +|---------|------------|--------------------|-------| |
|
95 | +| 1 | Devonshire | Pl | 02109 | |
|
96 | + |
|
97 | +## Load from the Census |
|
98 | + |
|
99 | +### Set up your defaults for generated loader scripts |
|
100 | + |
|
101 | +Make a directory for your Tiger "staging" data (the raw files downloaded |
|
102 | +from the Census): |
|
103 | + |
|
104 | + mkdir -p $HOME/gisdata/temp |
|
105 | + |
|
106 | +When the loader scripts are generated, Tiger consults the |
|
107 | +`tiger.loader_variables` and `tiger.loader_platform` tables. First set up |
|
108 | +`tiger.loader_variables` so it knows about the "staging" folder: |
|
109 | + |
|
110 | + psql -d $DB -c "update tiger.loader_variables set staging_fold = '$HOME/gisdata'" |
|
111 | + |
|
112 | +Now you will create a row in `tiger.loader_platform` that fits better with |
|
113 | +the Brew PostgreSQL setup. The key for these rows is called `os` -- create |
|
114 | +one where `os` is the name of the database. |
|
115 | + |
|
116 | + psql -d $DB -X -c "delete from tiger.loader_platform where os = '$DB'" |
|
117 | + psql -d $DB <<EOF |
|
118 | + insert into tiger.loader_platform(os, declare_sect, pgbin, wget, unzip_command, psql, path_sep, |
|
119 | + loader, environ_set_command, county_process_command) |
|
120 | + select '$DB', |
|
121 | + E'TMPDIR="\${staging_fold}/temp/"\nUNZIPTOOL=unzip\nWGETTOOL=wget\nPSQL=\'/usr/local/bin/psql -d ${DB} \'\nSHP2PGSQL=shp2pgsql\ncd \${staging_fold}\n', |
|
122 | + pgbin, wget, unzip_command, psql, path_sep, |
|
123 | + loader, environ_set_command, county_process_command |
|
124 | + FROM tiger.loader_platform |
|
125 | + WHERE os = 'sh'; |
|
126 | + EOF |
|
127 | + |
|
128 | +### Generate the scripts to load the geocoder data |
|
129 | + |
|
130 | +Generate the national data, on which state data depends: |
|
131 | + |
|
132 | + psql -d $DB -X -c "SELECT Loader_Generate_Nation_Script('$DB')" -tA > $HOME/gisdata/nation_script_load.sh |
|
133 | + |
|
134 | +Generate the data for the states you want -- here, Massachusetts and Minnesota: |
|
135 | + |
|
136 | + psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['MA', 'MN'], '$DB')" -tA > $HOME/gisdata/ma_mn_states_load.sh |
|
137 | + |
|
138 | +It is tempting to try to do this for all states . . . |
|
139 | + |
|
140 | + psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['AL' , 'AK' , 'AS' , 'AZ' , 'AR' , 'CA' , 'CO' , 'MP' , 'CT' , 'DE' , 'DC' , 'FL' , 'GA' , 'GU' , 'HI' , 'ID' , 'IL' , 'IN' , 'IA' , 'KS' , 'KY' , 'LA' , 'ME' , 'MD' , 'MA' , 'MI' , 'MN' , 'MS' , 'MO' , 'MT' , 'NE' , 'NV' , 'NH' , 'NJ' , 'NM' , 'NY' , 'NC' , 'ND' , 'OH' , 'OK' , 'OR' , 'PA' , 'PR' , 'RI' , 'SC' , 'SD' , 'TN' , 'TX' , 'VI' , 'UT' , 'VT' , 'VA' , 'WA' , 'WV' , 'WI' , 'WY'], '$DB')" -tA > $HOME/gisdata/all_states_load.sh |
|
141 | + |
|
142 | +But experience suggests that you should load each state separately, so that if |
|
143 | +you have to, you can re-run a state: |
|
144 | + |
|
145 | + states=(AL AK AS AZ AR CA CO MP CT DE DC FL GA GU HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA PR RI SC SD TN TX VI UT VT VA WA WV WI WY) |
|
146 | + for state in "${states[@]}" |
|
147 | + do |
|
148 | + psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['${state}'], '$DB')" -tA > $HOME/gisdata/${state}_state_load.sh |
|
149 | + done |
|
150 | + |
|
151 | +Make the scripts runnable . . . |
|
152 | + |
|
153 | + chmod +x $HOME/gisdata/*.sh |
|
154 | + |
|
155 | +### Generate |
|
156 | + |
|
157 | +Run the loader scripts. |
|
158 | + |
|
159 | + cd $HOME/gisdata |
|
160 | + ./nation_script_load.sh |
|
161 | + ./AZ_state_load.sh |
|
162 | + ./MN_state_load.sh |
|
163 | + # etc. |
|
164 | + |
|
165 | +### Now find lat/long for an address you know in one of the states |
|
166 | + |
|
167 | + select |
|
168 | + g.rating, |
|
169 | + st_y(g.geomout) as lat, |
|
170 | + st_x(g.geomout) as lon, |
|
171 | + (addy).address as stno, |
|
172 | + (addy).streetname as street, |
|
173 | + (addy).streettypeabbrev as styp, |
|
174 | + (addy).location as city, |
|
175 | + (addy).stateabbrev as st, |
|
176 | + (addy).zip |
|
177 | + from geocode(normalize_address('931 Portland Ave, Saint Paul, MN'), 1) as g; |
|
178 | + |
|
179 | +If the data load has run correctly, you should see: |
|
180 | + |
|
181 | +| rating | lat | lon | stno | street | styp | city | st | zip | |
|
182 | +|--------|------------------|-------------------|------|----------|------|---------|----|-------| |
|
183 | +| 9 | 44.9429540910341 | -93.1394249137542 | 931 | Portland | Ave | St. Paul| MN | 55104 | |
|
184 | + |
|
185 | +## Sample queries |
|
186 | + |
|
187 | +Here are a few ways to leverage the geocoding. |
|
188 | + |
|
189 | +### Basics |
|
190 | + |
|
191 | +The key functions are |
|
192 | + |
|
193 | +* `normalize_address` - <http://postgis.net/docs/Normalize_Address.html> - |
|
194 | + Converts a messy address to a uniform one broken down into its parts; |
|
195 | + can then be used as input to geocode. NOTE: Does not require the downloaded |
|
196 | + Census data. |
|
197 | +* `geocode` - <http://postgis.net/docs/Geocode.html> - Converts an address |
|
198 | + to mapping data. Can take as input a string or the result from |
|
199 | + `normalize_address`. One thing I don't discuss here is that it is possible |
|
200 | + to constrain the results with a "geometry filter," so that if the input |
|
201 | + string does not have city and state, you can make the lookup happen within |
|
202 | + some city and state. |
|
203 | + * `st_x` and `st_y` - <http://postgis.net/docs/ST_X.html> - Get latitude or |
|
204 | + longitude from "point" data. (These are actually from postGIS, |
|
205 | + not the Tiger extension.) |
|
206 | +* `pprint_addy` - <http://postgis.net/docs/Pprint_Addy.html> - Pretty print |
|
207 | + an address from `normalize_address`. Does not require the downloaded Census data. |
|
208 | +* `reverse_geocode` - <http://postgis.net/docs/Reverse_Geocode.html> - Find |
|
209 | + an address for a lat/long location. (Not discussed here.) |
|
210 | +* `st_distance_sphere` - <http://postgis.net/docs/manual-1.5/ST_Distance_Sphere.html> - Compute distance as the bird flies. (Not discussed here.) |
|
211 | + |
|
212 | +The following examples use a sample table like this: |
|
213 | + |
|
214 | + drop table if exists people; |
|
215 | + create table people (pid int, first text, last text, address text); |
|
216 | + insert into people (pid, first, last, address) values (1, 'John', 'Norman', '931 Portland, Saint Paul, MN'); |
|
217 | + insert into people (pid, first, last, address) values (2, 'Idan', 'Ben-Arieh', '6 valley rd, Natick MA 01760'); |
|
218 | + insert into people (pid, first, last, address) values (3, 'Jane', 'Doe', '3048 W San Juan Drive, Tucson AZ 85713'); |
|
219 | + insert into people (pid, first, last, address) values (4, 'Doug', 'Doucey', '400 W Girley, Prescott, AZ'); |
|
220 | + |
|
221 | +First off, many of the PostGIS functions return PostgreSQL "composite types": |
|
222 | +to destructure, you enclose the returned composite in parentheses, like so: |
|
223 | + |
|
224 | + select |
|
225 | + address, |
|
226 | + (normalize_address(address)).streetname, |
|
227 | + (normalize_address(address)).zip |
|
228 | + from people; |
|
229 | + |
|
230 | +Notice that the `normalize_address` function is being called twice. Ideally |
|
231 | +we would like to call it once. A common pattern you will see is some kind |
|
232 | +of renamed query result or subquery: |
|
233 | + |
|
234 | + select address, (na).streetname, (na).zip from |
|
235 | + (select |
|
236 | + address, |
|
237 | + normalize_address(address) as na |
|
238 | + from people) people_with_normalized_address; |
|
239 | + |
|
240 | +Another example of destructuring, with geocoding: |
|
241 | + |
|
242 | + select |
|
243 | + address, |
|
244 | + st_y((g).geomout) as latitude, |
|
245 | + st_x((g).geomout) as longitude, |
|
246 | + (g).rating, |
|
247 | + pprint_addy((g).addy) as normalized_address |
|
248 | + from |
|
249 | + (select |
|
250 | + address, |
|
251 | + geocode(normalize_address(address),1) as g |
|
252 | + from people) people_with_geocoded_address; |
|
253 | + |
|
254 | +### Select the original data, augmenting with additional geo columns |
|
255 | + |
|
256 | +Aside: I imported this data from csv with a neat tool called `pgfutter`: <https://github.com/lukasmartinelli/pgfutter>. |
|
257 | + |
|
258 | +Here the table with the original data is `import.patients` |
|
259 | +with a primary/unique key of `patient_demographics_patient_guid` and the original address in `patient_demographics_address`. |
|
260 | + |
|
261 | + select import.patients.*, |
|
262 | + geocoded.latitude as Latitude, |
|
263 | + geocoded.longitude as Longitude, |
|
264 | + geocoded.rating as "Geocode Rating", |
|
265 | + geocoded.normalized_address as "Normalized Address" |
|
266 | + from |
|
267 | + import.patients |
|
268 | + left join |
|
269 | + (select |
|
270 | + patient_demographics_patient_guid, |
|
271 | + st_y((geo).geomout) as latitude, |
|
272 | + st_x((geo).geomout) as longitude, |
|
273 | + (geo).rating, |
|
274 | + pprint_addy((geo).addy) as normalized_address |
|
275 | + from (select |
|
276 | + patient_demographics_patient_guid, |
|
277 | + (geocode(normalize_address(patient_demographics_address),1)) as geo |
|
278 | + from import.patients) addresses) geocoded |
|
279 | + on geocoded.patient_demographics_patient_guid = import.patients.patient_demographics_patient_guid; |
|
280 | + |
|
281 | +(On my 2014-era MBP, this takes about 0.16 seconds / row; about 015 seconds |
|
282 | +with an index on `patient_demographics_patient_guid` -- `create index on import.patients (patient_demographics_patient_guid);`.) |
|
283 | + |
|
284 | +### Add columns for geocoding, and only geocode when never geocoded |
|
285 | + |
|
286 | +Finally, in this example, we add columns to a table without geocoding. |
|
287 | +Then we only geocode rows where the `rating` column is null. This would be |
|
288 | +a nice overnight task to get geocoded all new rows that have never been |
|
289 | +geocoded. |
|
290 | + |
|
291 | + alter table import.patients |
|
292 | + add column latitude double precision, |
|
293 | + add column longitude double precision, |
|
294 | + add column rating integer, |
|
295 | + add column normalized_address varchar; |
|
296 | + |
|
297 | +(To get the data types, I used the `pg_typeof` column on a query that |
|
298 | +returned the different items.) |
|
299 | + |
|
300 | +And here's the query (this is verbatim from the documentation). |
|
301 | + |
|
302 | +Note the use of `index 3` -- this is worth eyeballing before running on all |
|
303 | +rows. |
|
304 | + |
|
305 | + update import.patients |
|
306 | + set (latitude, longitude, rating, normalized_address) |
|
307 | + = ( |
|
308 | + st_y((g.geo).geomout), |
|
309 | + st_x((g.geo).geomout), |
|
310 | + coalesce((g.geo).rating,-1), |
|
311 | + pprint_addy((g.geo).addy) |
|
312 | + ) |
|
313 | + from (select patient_demographics_patient_guid |
|
314 | + from import.patients |
|
315 | + where rating is null order by patient_demographics_patient_guid limit 3) as a |
|
316 | + left join (select patient_demographics_patient_guid, (geocode(patient_demographics_address,1)) as geo |
|
317 | + from import.patients As ag |
|
318 | + where ag.rating is null order by patient_demographics_patient_guid limit 3) As g on a.patient_demographics_patient_guid = g.patient_demographics_patient_guid |
|
319 | + where a.patient_demographics_patient_guid = import.patients.patient_demographics_patient_guid; |
|
320 | + |
|
321 | +(About 0.20 seconds/row without indexing.) |
|
322 | + |
|
323 | +----------- |
|
324 | + |
|
325 | +### Visualization |
|
326 | + |
|
327 | +Here's a quick way to get a visualization on your laptop. |
|
328 | + |
|
329 | +First, install R. |
|
330 | + |
|
331 | + brew tap homebrew/science |
|
332 | + brew install R |
|
333 | + |
|
334 | +Export your geocoded table to a CSV. |
|
335 | + |
|
336 | + psql -c "copy import.patients to '$HOME/Desktop/patients.csv' with (format csv, header)" |
|
337 | + |
|
338 | +In R, |
|
339 | + |
|
340 | + install.packages('ggmap') |
|
341 | + library(ggmap) |
|
342 | + patients <- read.csv(file="/Users/jgn/Desktop/patients.csv", header=TRUE) |
|
343 | + |
|
344 | +The fastest way to get started with `ggmap` is to use the `qmplot` function. |
|
345 | +My advice is to read the docs and play around. |
|
346 | + |
|
347 | + qmplot(longitude, latitude, data=patients, source='stamen') |
|
348 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap') |
|
349 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', zoom=10) |
|
350 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', geom = 'density2d') |
|
351 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', alpha=I(0.5), zoom=10, geom='density2d') |