d517277ad96c5b7d9627d19b98e465e32bfe583c
public/postgis.md
| ... | ... | @@ -0,0 +1,351 @@ |
| 1 | +## Overview |
|
| 2 | + |
|
| 3 | +[PostGIS](http://postgis.net/docs/index.html) is the extension to PostgreSQL |
|
| 4 | +that provides for graphical information services objects to be utilized by |
|
| 5 | +the database. (PostGIS is pretty similar to what you get from Oracle Spatial.) |
|
| 6 | +The geocoding is supported by a PostGIS "extra" called |
|
| 7 | +[Tiger Geocoder](http://postgis.net/docs/Extras.html#Tiger_Geocoder) (it |
|
| 8 | +is in fact the only "extra"). [Tiger](https://www.census.gov/geo/maps-data/data/tiger.html) |
|
| 9 | +is a product of the United States Census Bureau. |
|
| 10 | + |
|
| 11 | +For PostgreSQL, the Tiger Geocoder is maintained in two schemas: `tiger` |
|
| 12 | +contains the code and the skeletal data structures. `tiger_data` has the |
|
| 13 | +national and state data which is downloaded from the Census. |
|
| 14 | + |
|
| 15 | +## Version |
|
| 16 | + |
|
| 17 | +Recommended: 10.x |
|
| 18 | + |
|
| 19 | +## Setup |
|
| 20 | + |
|
| 21 | +### Ensure that your PG install is clean; perhaps best to re-install |
|
| 22 | + |
|
| 23 | +#### Boxen |
|
| 24 | + |
|
| 25 | + # Stop any running PG; may want to do this through boxen-services |
|
| 26 | + pg_ctl -D /opt/boxen/homebrew/var/postgres stop |
|
| 27 | + |
|
| 28 | + # Remove the data directory |
|
| 29 | + rm -fR /opt/boxen/homebrew/var/postgres/* |
|
| 30 | + |
|
| 31 | + # Remove any old PostGIS and PostgreSQL |
|
| 32 | + brew unlink postgis |
|
| 33 | + brew unlink postgresql |
|
| 34 | + brew uninstall postgis |
|
| 35 | + brew uninstall postgresql |
|
| 36 | + |
|
| 37 | +#### Brew |
|
| 38 | + |
|
| 39 | +For Iora with ChirpStrap, |
|
| 40 | + |
|
| 41 | + brew services stop postgresql@9.4 |
|
| 42 | + brew uninstall postgresql@9.4 |
|
| 43 | + rm -fR /usr/local/var/postgresql\@9.4 |
|
| 44 | + brew list | grep postgres # expect see nothing |
|
| 45 | + |
|
| 46 | +### Install PostGIS |
|
| 47 | + |
|
| 48 | + brew install postgresql |
|
| 49 | + brew services start postgresql |
|
| 50 | + |
|
| 51 | + # to verify your install |
|
| 52 | + psql postgres |
|
| 53 | + \q |
|
| 54 | + |
|
| 55 | + # It can be handy to create a database with the name of your user so |
|
| 56 | + # you can just type `psql`: |
|
| 57 | + psql -d postgres -c "create database ${USER}" |
|
| 58 | + |
|
| 59 | + brew install postgis |
|
| 60 | + |
|
| 61 | +### Install the extensions |
|
| 62 | + |
|
| 63 | +For the official instructions, see <http://postgis.net/docs/postgis_installation.html#install_tiger_geocoder_extension> |
|
| 64 | + |
|
| 65 | + # Define a name for your database |
|
| 66 | + export DB=icisstaff_development |
|
| 67 | + |
|
| 68 | + psql -d $DB |
|
| 69 | + create extension postgis; |
|
| 70 | + create extension postgis_topology; |
|
| 71 | + create extension fuzzystrmatch; |
|
| 72 | + create extension address_standardizer; |
|
| 73 | + create extension postgis_tiger_geocoder; |
|
| 74 | + |
|
| 75 | +### Verify the address standardizer |
|
| 76 | + |
|
| 77 | + select num, street, city, state, zip |
|
| 78 | + from parse_address('1 Devonshire Place, Boston, MA 02109'); |
|
| 79 | + |
|
| 80 | +should produce a breakdown of street, city, state, zip: |
|
| 81 | + |
|
| 82 | +| num | street | city | state | zip | |
|
| 83 | +|-----|------------------|--------|-------|-------| |
|
| 84 | +| 1 | Devonshire Place | Boston | MA | 02109 | |
|
| 85 | + |
|
| 86 | +### Should also be able to . . . |
|
| 87 | + |
|
| 88 | + select na.address, na.streetname,na.streettypeabbrev, na.zip |
|
| 89 | + from normalize_address('1 Devonshire Place, Boston, MA 02109') as na; |
|
| 90 | + |
|
| 91 | +which produces: |
|
| 92 | + |
|
| 93 | +| address | streetname | streettypeabbrev | zip | |
|
| 94 | +|---------|------------|--------------------|-------| |
|
| 95 | +| 1 | Devonshire | Pl | 02109 | |
|
| 96 | + |
|
| 97 | +## Load from the Census |
|
| 98 | + |
|
| 99 | +### Set up your defaults for generated loader scripts |
|
| 100 | + |
|
| 101 | +Make a directory for your Tiger "staging" data (the raw files downloaded |
|
| 102 | +from the Census): |
|
| 103 | + |
|
| 104 | + mkdir -p $HOME/gisdata/temp |
|
| 105 | + |
|
| 106 | +When the loader scripts are generated, Tiger consults the |
|
| 107 | +`tiger.loader_variables` and `tiger.loader_platform` tables. First set up |
|
| 108 | +`tiger.loader_variables` so it knows about the "staging" folder: |
|
| 109 | + |
|
| 110 | + psql -d $DB -c "update tiger.loader_variables set staging_fold = '$HOME/gisdata'" |
|
| 111 | + |
|
| 112 | +Now you will create a row in `tiger.loader_platform` that fits better with |
|
| 113 | +the Brew PostgreSQL setup. The key for these rows is called `os` -- create |
|
| 114 | +one where `os` is the name of the database. |
|
| 115 | + |
|
| 116 | + psql -d $DB -X -c "delete from tiger.loader_platform where os = '$DB'" |
|
| 117 | + psql -d $DB <<EOF |
|
| 118 | + insert into tiger.loader_platform(os, declare_sect, pgbin, wget, unzip_command, psql, path_sep, |
|
| 119 | + loader, environ_set_command, county_process_command) |
|
| 120 | + select '$DB', |
|
| 121 | + E'TMPDIR="\${staging_fold}/temp/"\nUNZIPTOOL=unzip\nWGETTOOL=wget\nPSQL=\'/usr/local/bin/psql -d ${DB} \'\nSHP2PGSQL=shp2pgsql\ncd \${staging_fold}\n', |
|
| 122 | + pgbin, wget, unzip_command, psql, path_sep, |
|
| 123 | + loader, environ_set_command, county_process_command |
|
| 124 | + FROM tiger.loader_platform |
|
| 125 | + WHERE os = 'sh'; |
|
| 126 | + EOF |
|
| 127 | + |
|
| 128 | +### Generate the scripts to load the geocoder data |
|
| 129 | + |
|
| 130 | +Generate the national data, on which state data depends: |
|
| 131 | + |
|
| 132 | + psql -d $DB -X -c "SELECT Loader_Generate_Nation_Script('$DB')" -tA > $HOME/gisdata/nation_script_load.sh |
|
| 133 | + |
|
| 134 | +Generate the data for the states you want -- here, Massachusetts and Minnesota: |
|
| 135 | + |
|
| 136 | + psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['MA', 'MN'], '$DB')" -tA > $HOME/gisdata/ma_mn_states_load.sh |
|
| 137 | + |
|
| 138 | +It is tempting to try to do this for all states . . . |
|
| 139 | + |
|
| 140 | + psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['AL' , 'AK' , 'AS' , 'AZ' , 'AR' , 'CA' , 'CO' , 'MP' , 'CT' , 'DE' , 'DC' , 'FL' , 'GA' , 'GU' , 'HI' , 'ID' , 'IL' , 'IN' , 'IA' , 'KS' , 'KY' , 'LA' , 'ME' , 'MD' , 'MA' , 'MI' , 'MN' , 'MS' , 'MO' , 'MT' , 'NE' , 'NV' , 'NH' , 'NJ' , 'NM' , 'NY' , 'NC' , 'ND' , 'OH' , 'OK' , 'OR' , 'PA' , 'PR' , 'RI' , 'SC' , 'SD' , 'TN' , 'TX' , 'VI' , 'UT' , 'VT' , 'VA' , 'WA' , 'WV' , 'WI' , 'WY'], '$DB')" -tA > $HOME/gisdata/all_states_load.sh |
|
| 141 | + |
|
| 142 | +But experience suggests that you should load each state separately, so that if |
|
| 143 | +you have to, you can re-run a state: |
|
| 144 | + |
|
| 145 | + states=(AL AK AS AZ AR CA CO MP CT DE DC FL GA GU HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA PR RI SC SD TN TX VI UT VT VA WA WV WI WY) |
|
| 146 | + for state in "${states[@]}" |
|
| 147 | + do |
|
| 148 | + psql -d $DB -X -c "SELECT Loader_Generate_Script(ARRAY['${state}'], '$DB')" -tA > $HOME/gisdata/${state}_state_load.sh |
|
| 149 | + done |
|
| 150 | + |
|
| 151 | +Make the scripts runnable . . . |
|
| 152 | + |
|
| 153 | + chmod +x $HOME/gisdata/*.sh |
|
| 154 | + |
|
| 155 | +### Generate |
|
| 156 | + |
|
| 157 | +Run the loader scripts. |
|
| 158 | + |
|
| 159 | + cd $HOME/gisdata |
|
| 160 | + ./nation_script_load.sh |
|
| 161 | + ./AZ_state_load.sh |
|
| 162 | + ./MN_state_load.sh |
|
| 163 | + # etc. |
|
| 164 | + |
|
| 165 | +### Now find lat/long for an address you know in one of the states |
|
| 166 | + |
|
| 167 | + select |
|
| 168 | + g.rating, |
|
| 169 | + st_y(g.geomout) as lat, |
|
| 170 | + st_x(g.geomout) as lon, |
|
| 171 | + (addy).address as stno, |
|
| 172 | + (addy).streetname as street, |
|
| 173 | + (addy).streettypeabbrev as styp, |
|
| 174 | + (addy).location as city, |
|
| 175 | + (addy).stateabbrev as st, |
|
| 176 | + (addy).zip |
|
| 177 | + from geocode(normalize_address('931 Portland Ave, Saint Paul, MN'), 1) as g; |
|
| 178 | + |
|
| 179 | +If the data load has run correctly, you should see: |
|
| 180 | + |
|
| 181 | +| rating | lat | lon | stno | street | styp | city | st | zip | |
|
| 182 | +|--------|------------------|-------------------|------|----------|------|---------|----|-------| |
|
| 183 | +| 9 | 44.9429540910341 | -93.1394249137542 | 931 | Portland | Ave | St. Paul| MN | 55104 | |
|
| 184 | + |
|
| 185 | +## Sample queries |
|
| 186 | + |
|
| 187 | +Here are a few ways to leverage the geocoding. |
|
| 188 | + |
|
| 189 | +### Basics |
|
| 190 | + |
|
| 191 | +The key functions are |
|
| 192 | + |
|
| 193 | +* `normalize_address` - <http://postgis.net/docs/Normalize_Address.html> - |
|
| 194 | + Converts a messy address to a uniform one broken down into its parts; |
|
| 195 | + can then be used as input to geocode. NOTE: Does not require the downloaded |
|
| 196 | + Census data. |
|
| 197 | +* `geocode` - <http://postgis.net/docs/Geocode.html> - Converts an address |
|
| 198 | + to mapping data. Can take as input a string or the result from |
|
| 199 | + `normalize_address`. One thing I don't discuss here is that it is possible |
|
| 200 | + to constrain the results with a "geometry filter," so that if the input |
|
| 201 | + string does not have city and state, you can make the lookup happen within |
|
| 202 | + some city and state. |
|
| 203 | + * `st_x` and `st_y` - <http://postgis.net/docs/ST_X.html> - Get latitude or |
|
| 204 | + longitude from "point" data. (These are actually from postGIS, |
|
| 205 | + not the Tiger extension.) |
|
| 206 | +* `pprint_addy` - <http://postgis.net/docs/Pprint_Addy.html> - Pretty print |
|
| 207 | + an address from `normalize_address`. Does not require the downloaded Census data. |
|
| 208 | +* `reverse_geocode` - <http://postgis.net/docs/Reverse_Geocode.html> - Find |
|
| 209 | + an address for a lat/long location. (Not discussed here.) |
|
| 210 | +* `st_distance_sphere` - <http://postgis.net/docs/manual-1.5/ST_Distance_Sphere.html> - Compute distance as the bird flies. (Not discussed here.) |
|
| 211 | + |
|
| 212 | +The following examples use a sample table like this: |
|
| 213 | + |
|
| 214 | + drop table if exists people; |
|
| 215 | + create table people (pid int, first text, last text, address text); |
|
| 216 | + insert into people (pid, first, last, address) values (1, 'John', 'Norman', '931 Portland, Saint Paul, MN'); |
|
| 217 | + insert into people (pid, first, last, address) values (2, 'Idan', 'Ben-Arieh', '6 valley rd, Natick MA 01760'); |
|
| 218 | + insert into people (pid, first, last, address) values (3, 'Jane', 'Doe', '3048 W San Juan Drive, Tucson AZ 85713'); |
|
| 219 | + insert into people (pid, first, last, address) values (4, 'Doug', 'Doucey', '400 W Girley, Prescott, AZ'); |
|
| 220 | + |
|
| 221 | +First off, many of the PostGIS functions return PostgreSQL "composite types": |
|
| 222 | +to destructure, you enclose the returned composite in parentheses, like so: |
|
| 223 | + |
|
| 224 | + select |
|
| 225 | + address, |
|
| 226 | + (normalize_address(address)).streetname, |
|
| 227 | + (normalize_address(address)).zip |
|
| 228 | + from people; |
|
| 229 | + |
|
| 230 | +Notice that the `normalize_address` function is being called twice. Ideally |
|
| 231 | +we would like to call it once. A common pattern you will see is some kind |
|
| 232 | +of renamed query result or subquery: |
|
| 233 | + |
|
| 234 | + select address, (na).streetname, (na).zip from |
|
| 235 | + (select |
|
| 236 | + address, |
|
| 237 | + normalize_address(address) as na |
|
| 238 | + from people) people_with_normalized_address; |
|
| 239 | + |
|
| 240 | +Another example of destructuring, with geocoding: |
|
| 241 | + |
|
| 242 | + select |
|
| 243 | + address, |
|
| 244 | + st_y((g).geomout) as latitude, |
|
| 245 | + st_x((g).geomout) as longitude, |
|
| 246 | + (g).rating, |
|
| 247 | + pprint_addy((g).addy) as normalized_address |
|
| 248 | + from |
|
| 249 | + (select |
|
| 250 | + address, |
|
| 251 | + geocode(normalize_address(address),1) as g |
|
| 252 | + from people) people_with_geocoded_address; |
|
| 253 | + |
|
| 254 | +### Select the original data, augmenting with additional geo columns |
|
| 255 | + |
|
| 256 | +Aside: I imported this data from csv with a neat tool called `pgfutter`: <https://github.com/lukasmartinelli/pgfutter>. |
|
| 257 | + |
|
| 258 | +Here the table with the original data is `import.patients` |
|
| 259 | +with a primary/unique key of `patient_demographics_patient_guid` and the original address in `patient_demographics_address`. |
|
| 260 | + |
|
| 261 | + select import.patients.*, |
|
| 262 | + geocoded.latitude as Latitude, |
|
| 263 | + geocoded.longitude as Longitude, |
|
| 264 | + geocoded.rating as "Geocode Rating", |
|
| 265 | + geocoded.normalized_address as "Normalized Address" |
|
| 266 | + from |
|
| 267 | + import.patients |
|
| 268 | + left join |
|
| 269 | + (select |
|
| 270 | + patient_demographics_patient_guid, |
|
| 271 | + st_y((geo).geomout) as latitude, |
|
| 272 | + st_x((geo).geomout) as longitude, |
|
| 273 | + (geo).rating, |
|
| 274 | + pprint_addy((geo).addy) as normalized_address |
|
| 275 | + from (select |
|
| 276 | + patient_demographics_patient_guid, |
|
| 277 | + (geocode(normalize_address(patient_demographics_address),1)) as geo |
|
| 278 | + from import.patients) addresses) geocoded |
|
| 279 | + on geocoded.patient_demographics_patient_guid = import.patients.patient_demographics_patient_guid; |
|
| 280 | + |
|
| 281 | +(On my 2014-era MBP, this takes about 0.16 seconds / row; about 015 seconds |
|
| 282 | +with an index on `patient_demographics_patient_guid` -- `create index on import.patients (patient_demographics_patient_guid);`.) |
|
| 283 | + |
|
| 284 | +### Add columns for geocoding, and only geocode when never geocoded |
|
| 285 | + |
|
| 286 | +Finally, in this example, we add columns to a table without geocoding. |
|
| 287 | +Then we only geocode rows where the `rating` column is null. This would be |
|
| 288 | +a nice overnight task to get geocoded all new rows that have never been |
|
| 289 | +geocoded. |
|
| 290 | + |
|
| 291 | + alter table import.patients |
|
| 292 | + add column latitude double precision, |
|
| 293 | + add column longitude double precision, |
|
| 294 | + add column rating integer, |
|
| 295 | + add column normalized_address varchar; |
|
| 296 | + |
|
| 297 | +(To get the data types, I used the `pg_typeof` column on a query that |
|
| 298 | +returned the different items.) |
|
| 299 | + |
|
| 300 | +And here's the query (this is verbatim from the documentation). |
|
| 301 | + |
|
| 302 | +Note the use of `index 3` -- this is worth eyeballing before running on all |
|
| 303 | +rows. |
|
| 304 | + |
|
| 305 | + update import.patients |
|
| 306 | + set (latitude, longitude, rating, normalized_address) |
|
| 307 | + = ( |
|
| 308 | + st_y((g.geo).geomout), |
|
| 309 | + st_x((g.geo).geomout), |
|
| 310 | + coalesce((g.geo).rating,-1), |
|
| 311 | + pprint_addy((g.geo).addy) |
|
| 312 | + ) |
|
| 313 | + from (select patient_demographics_patient_guid |
|
| 314 | + from import.patients |
|
| 315 | + where rating is null order by patient_demographics_patient_guid limit 3) as a |
|
| 316 | + left join (select patient_demographics_patient_guid, (geocode(patient_demographics_address,1)) as geo |
|
| 317 | + from import.patients As ag |
|
| 318 | + where ag.rating is null order by patient_demographics_patient_guid limit 3) As g on a.patient_demographics_patient_guid = g.patient_demographics_patient_guid |
|
| 319 | + where a.patient_demographics_patient_guid = import.patients.patient_demographics_patient_guid; |
|
| 320 | + |
|
| 321 | +(About 0.20 seconds/row without indexing.) |
|
| 322 | + |
|
| 323 | +----------- |
|
| 324 | + |
|
| 325 | +### Visualization |
|
| 326 | + |
|
| 327 | +Here's a quick way to get a visualization on your laptop. |
|
| 328 | + |
|
| 329 | +First, install R. |
|
| 330 | + |
|
| 331 | + brew tap homebrew/science |
|
| 332 | + brew install R |
|
| 333 | + |
|
| 334 | +Export your geocoded table to a CSV. |
|
| 335 | + |
|
| 336 | + psql -c "copy import.patients to '$HOME/Desktop/patients.csv' with (format csv, header)" |
|
| 337 | + |
|
| 338 | +In R, |
|
| 339 | + |
|
| 340 | + install.packages('ggmap') |
|
| 341 | + library(ggmap) |
|
| 342 | + patients <- read.csv(file="/Users/jgn/Desktop/patients.csv", header=TRUE) |
|
| 343 | + |
|
| 344 | +The fastest way to get started with `ggmap` is to use the `qmplot` function. |
|
| 345 | +My advice is to read the docs and play around. |
|
| 346 | + |
|
| 347 | + qmplot(longitude, latitude, data=patients, source='stamen') |
|
| 348 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap') |
|
| 349 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', zoom=10) |
|
| 350 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', geom = 'density2d') |
|
| 351 | + qmplot(longitude, latitude, data=patients, source='google', maptype = 'roadmap', alpha=I(0.5), zoom=10, geom='density2d') |