11f8d2f868e150b190ce2d23f16f3417355da229
public/sql.md
| ... | ... | @@ -0,0 +1,698 @@ |
| 1 | +# SQL / Relational |
|
| 2 | + |
|
| 3 | +<style type="text/css"> |
|
| 4 | +div .venn { |
|
| 5 | + float: left; |
|
| 6 | + width: 300px; |
|
| 7 | + height: 300px; |
|
| 8 | + border-radius: 150px; |
|
| 9 | + border: 3px solid #000; |
|
| 10 | + color: #000; |
|
| 11 | + text-align: center; |
|
| 12 | + vertical-align: middle; |
|
| 13 | + font-size: 1.0em; |
|
| 14 | + line-height: 300px; |
|
| 15 | + font-weight: bold; |
|
| 16 | + font-family: sans-serif; |
|
| 17 | + position: relative; |
|
| 18 | +} |
|
| 19 | +.inner_one { background-color: yellow; opacity: 0.5; } |
|
| 20 | +.inner_two::before{ content: 'id = name_id'; position: absolute; left: 10px } |
|
| 21 | +.inner_two { background-color: red; opacity: 0.5; margin-left: -110px; } |
|
| 22 | + |
|
| 23 | +.left_one { background-color: #ff8040; opacity: 0.5; z-index: 1; } |
|
| 24 | +.left_two { background-color: red; opacity: 0.5; margin-left: -110px; } |
|
| 25 | +</style> |
|
| 26 | + |
|
| 27 | +### [1] Why we (I) love relational databases |
|
| 28 | + |
|
| 29 | +* **ACID** |
|
| 30 | + * Atomicity (transactions [multiple operations can be "all or nothing"] - even with externalities such as power loss) |
|
| 31 | + * Consistency (what's on disk is always acceptable as a state - including constraints, etc.) |
|
| 32 | + * Isolation (no transaction affects another) |
|
| 33 | + * Durability (if an insert completes, it is really written to disk) |
|
| 34 | +* Do we need these? |
|
| 35 | +* What apps don't need these things? |
|
| 36 | + * If Facebook fails to provide you with 1 in 100,000 status updates from your friends, what is the cost? |
|
| 37 | +* Can we get these properties from a database distributed across a network? (CAP Theorem: See <http://www.youtube.com/watch?v=Jw1iFr4v58M> and <http://blog.beany.co.kr/wp-content/uploads/2011/03/nosql_cap.png>) |
|
| 38 | + |
|
| 39 | +### [2] Creating a database and connecting (from shell) |
|
| 40 | + |
|
| 41 | + dropdb example |
|
| 42 | + createdb example |
|
| 43 | + psql example |
|
| 44 | + |
|
| 45 | +### [3] Dropping and creating a table |
|
| 46 | + |
|
| 47 | + drop table names; |
|
| 48 | + create table names ( |
|
| 49 | + id serial, -- "auto increment" |
|
| 50 | + first_name text |
|
| 51 | + ); |
|
| 52 | + |
|
| 53 | +### [4] Another |
|
| 54 | + |
|
| 55 | + drop table books; |
|
| 56 | + create table books ( |
|
| 57 | + id serial, -- "auto increment" |
|
| 58 | + name_id int, |
|
| 59 | + title text |
|
| 60 | + ); |
|
| 61 | + |
|
| 62 | +### [5] Adding data |
|
| 63 | + |
|
| 64 | + delete from names; |
|
| 65 | + insert into names (first_name) values ('John'); |
|
| 66 | + insert into names (first_name) values ('James'); |
|
| 67 | + insert into names (first_name) values ('Tom'); |
|
| 68 | + |
|
| 69 | +* **Tip:** Always give the column names so that you are not dependent on the created order |
|
| 70 | + |
|
| 71 | +### [6] more |
|
| 72 | + |
|
| 73 | + delete from books; |
|
| 74 | + insert into books (name_id, title) values (1, 'The C Programming Language'); |
|
| 75 | + insert into books (name_id, title) values (1, 'The Ruby Programming Language'); |
|
| 76 | + insert into books (name_id, title) values (2, 'Java Concurrency in Practice'); |
|
| 77 | + insert into books (name_id, title) values (null, 'SQL for Smarties'); |
|
| 78 | + -- Notice: We are assuming we know the values for names.id |
|
| 79 | + -- What if we re-ran the inserts for names? |
|
| 80 | + |
|
| 81 | +### [7] query - wildcard for columns |
|
| 82 | + |
|
| 83 | + select * from names; |
|
| 84 | + select * from books; |
|
| 85 | + |
|
| 86 | +### [8] query - rearranging columns and specifying order |
|
| 87 | + |
|
| 88 | + select first_name, id from names; |
|
| 89 | + select name_id, id, title from books; |
|
| 90 | + |
|
| 91 | +### [9] cross join - return "product" (all combinations) of two tables |
|
| 92 | + |
|
| 93 | + select * |
|
| 94 | + from names, books; |
|
| 95 | + -- don't do this with big tables |
|
| 96 | + |
|
| 97 | +### [10] same thing, but putting ids next together |
|
| 98 | + |
|
| 99 | + select names.first_name, names.id as "names.id", |
|
| 100 | + books.name_id as "books.name_id", books.id as "books.id", books.title |
|
| 101 | + from names, books |
|
| 102 | + order by names.id; |
|
| 103 | + -- don't do this with big tables |
|
| 104 | + |
|
| 105 | +### [11] inner join - Venn diagram |
|
| 106 | + |
|
| 107 | +--- |
|
| 108 | + |
|
| 109 | +<b>Inner join: What we want is represented by the <span style="background-color: #ff8040;">orange</span> section</b> |
|
| 110 | +<br/> |
|
| 111 | + |
|
| 112 | +<div class="venn inner_one">Names</div> |
|
| 113 | +<div class="venn inner_two">Books</div> |
|
| 114 | +<div style="clear: both;"/> |
|
| 115 | + |
|
| 116 | +<div> |
|
| 117 | +<div style="float: left;"> |
|
| 118 | +<b>names</b> |
|
| 119 | +<table> |
|
| 120 | + <tr><th>first_name</th><th>id</th></tr> |
|
| 121 | + <tr><td>John</td><td style="background-color: yellow; opacity: 0.5;">1</td></tr> |
|
| 122 | + <tr><td>James</td><td style="background-color: yellow; opacity: 0.5;">2</td></tr> |
|
| 123 | + <tr><td>Tom</td><td style="background-color: yellow; opacity: 0.5;">3</td></tr> |
|
| 124 | +</table> |
|
| 125 | +</div> |
|
| 126 | + |
|
| 127 | +<div style="float: right;"> |
|
| 128 | +<b>books</b> |
|
| 129 | +<table> |
|
| 130 | + <tr><th>name_id</th><th>id</th><th>title</th></tr> |
|
| 131 | + <tr><td style="background-color: yellow; opacity: 0.5;">1</td><td style="background-color: red; opacity: 0.5;">1</td><td>The C Programming Language</td></tr> |
|
| 132 | + <tr><td style="background-color: yellow; opacity: 0.5;">1</td><td style="background-color: red; opacity: 0.5;">2</td><td>The Ruby Programming Language</td></tr> |
|
| 133 | + <tr><td style="background-color: yellow; opacity: 0.5;">2</td><td style="background-color: red; opacity: 0.5;">3</td><td>Java Concurrency in Practice</td></tr> |
|
| 134 | + <tr><td>(null)</td><td style="background-color: red; opacity: 0.5;">4</td><td>SQL for Smarties</td></tr> |
|
| 135 | +</table> |
|
| 136 | +</div> |
|
| 137 | +</div> |
|
| 138 | +<div style="clear: both;"/> |
|
| 139 | +<br/> |
|
| 140 | + |
|
| 141 | +--- |
|
| 142 | + |
|
| 143 | +### [12] inner join - SQL - "Give me data from rows from both tables where the condition is true" |
|
| 144 | + |
|
| 145 | + -- "explicit join notation" |
|
| 146 | + select first_name, title |
|
| 147 | + from books join names |
|
| 148 | + on names.id = books.name_id; |
|
| 149 | + |
|
| 150 | + -- "implicit join notation" |
|
| 151 | + select first_name, title |
|
| 152 | + from books, names |
|
| 153 | + where names.id = books.name_id; |
|
| 154 | + |
|
| 155 | + -- Same as "product" join above, but using where clause |
|
| 156 | + select names.first_name, names.id as "names.id", |
|
| 157 | + books.name_id as "books.name_id", books.id as "books.id", books.title |
|
| 158 | + from names, books |
|
| 159 | + where names.id = books.name_id |
|
| 160 | + order by names.id; |
|
| 161 | + |
|
| 162 | +### [13] left join - Venn diagram |
|
| 163 | + |
|
| 164 | +--- |
|
| 165 | + |
|
| 166 | +<b>Left join: What we want is represented by the <span style="background-color: #ffc0a0;">salmon</span> and <span style="background-color: #ff8040;">orange</span> sections</b> |
|
| 167 | +<br/> |
|
| 168 | + |
|
| 169 | +<div class="venn left_one">Names</div> |
|
| 170 | +<div class="venn left_two">Books</div> |
|
| 171 | +<div style="clear: both;"/> |
|
| 172 | + |
|
| 173 | +<div> |
|
| 174 | +<div style="float: left;"> |
|
| 175 | +<b>names</b> |
|
| 176 | +<table> |
|
| 177 | + <tr><th>first_name</th><th>id</th></tr> |
|
| 178 | + <tr><td>John</td><td style="background-color: #ffc0a0;">1</td></tr> |
|
| 179 | + <tr><td>James</td><td style="background-color: #ffc0a0;">2</td></tr> |
|
| 180 | + <tr><td>Tom</td><td style="background-color: #ffc0a0;">3</td></tr> |
|
| 181 | +</table> |
|
| 182 | +</div> |
|
| 183 | + |
|
| 184 | +<div style="float: right;"> |
|
| 185 | +<b>books</b> |
|
| 186 | +<table> |
|
| 187 | + <tr><th>name_id</th><th>id</th><th>title</th></tr> |
|
| 188 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">1</td><td>The C Programming Language</td></tr> |
|
| 189 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">2</td><td>The Ruby Programming Language</td></tr> |
|
| 190 | + <tr><td style="background-color: #ffc0a0;">2</td><td style="background-color: red; opacity: 0.5;">3</td><td>Java Concurrency in Practice</td></tr> |
|
| 191 | + <tr><td>(null)</td><td style="background-color: red; opacity: 0.5;">4</td><td>SQL for Smarties</td></tr> |
|
| 192 | +</table> |
|
| 193 | +</div> |
|
| 194 | +</div> |
|
| 195 | +<div style="clear: both;"/> |
|
| 196 | +<br/> |
|
| 197 | + |
|
| 198 | +--- |
|
| 199 | + |
|
| 200 | +### [14] left join - SQL - "Give me data for 'left' table, and data the 'right' table when the condition is true, but nulls for the 'right' table when the condition is false" |
|
| 201 | + |
|
| 202 | + select names.first_name, books.title |
|
| 203 | + from names left join books |
|
| 204 | + on books.name_id = names.id; |
|
| 205 | + |
|
| 206 | + first_name | title |
|
| 207 | + ------------+------------------------------- |
|
| 208 | + John | The C Programming Language |
|
| 209 | + John | The Ruby Programming Language |
|
| 210 | + James | Java Concurrency in Practice |
|
| 211 | + Tom | |
|
| 212 | + (4 rows) |
|
| 213 | + |
|
| 214 | +### [15] left join is dominate pattern in ActiveRecord . . . |
|
| 215 | + |
|
| 216 | +* why? |
|
| 217 | + |
|
| 218 | +### [16] left join is dominate pattern in ActiveRecord |
|
| 219 | + |
|
| 220 | +* when we send an association message to a relation, we expect that association to be applied to each element |
|
| 221 | +* Note, however, that ActiveRecord avoids real left joins |
|
| 222 | + |
|
| 223 | +### [17] Now that you're so smart |
|
| 224 | + |
|
| 225 | +* How many rows will each of the following queries return? |
|
| 226 | + |
|
| 227 | + select * from names left join books on books.name_id = names.id; |
|
| 228 | + select * from names right join books on books.name_id = names.id; |
|
| 229 | + select * from books left join names on books.name_id = names.id; |
|
| 230 | + select * from books right join names on books.name_id = names.id; |
|
| 231 | + |
|
| 232 | +### [18] demo - study the log |
|
| 233 | + |
|
| 234 | + irb |
|
| 235 | + require 'active_record' |
|
| 236 | + require 'pg' |
|
| 237 | + require 'logger' |
|
| 238 | + B = ActiveRecord::Base |
|
| 239 | + B.logger = Logger.new(STDOUT) |
|
| 240 | + M = ActiveRecord::Migration |
|
| 241 | + |
|
| 242 | + spec = { adapter: :postgresql, database: :ex } |
|
| 243 | + B.establish_connection(adapter: :postgresql, database: :postgres, schema_search_path: :public) |
|
| 244 | + B.connection.drop_database spec[:database] |
|
| 245 | + B.connection.create_database spec[:database] |
|
| 246 | + B.establish_connection spec |
|
| 247 | + |
|
| 248 | + B.mass_assignment_sanitizer = :logger |
|
| 249 | + CN = Class.new(M) { def change; create_table :names do |t| t.text :first_name end; end } |
|
| 250 | + CB = Class.new(M) { def change; create_table :books do |t| t.text :title; t.references :name; end; end } |
|
| 251 | + CN.migrate(:up) |
|
| 252 | + CB.migrate(:up) |
|
| 253 | + Name = Class.new(B) { has_many :books } |
|
| 254 | + Book = Class.new(B) { belongs_to :name } |
|
| 255 | + |
|
| 256 | + john = Name.create! first_name: 'John' |
|
| 257 | + james = Name.create! first_name: 'James' |
|
| 258 | + tom = Name.create! first_name: 'Tom' |
|
| 259 | + c = Book.create! name: john, title: 'The C Programming Language' |
|
| 260 | + ruby = Book.create! name: john, title: 'The Ruby Programming Language' |
|
| 261 | + java = Book.create! name: james, title: 'Java Concurrency in Practice' |
|
| 262 | + |
|
| 263 | + # Let's discuss what we see in the log |
|
| 264 | + |
|
| 265 | + Name.all(include: :books) |
|
| 266 | + Name.all(joins: :books) |
|
| 267 | + Name.all(joins: 'left join books on books.name_id = names.id') |
|
| 268 | + Name.all(joins: 'left join books on books.name_id = names.id', include: :books) |
|
| 269 | + |
|
| 270 | +### [19] obviously not the way we would represent book ownership for realsies |
|
| 271 | + |
|
| 272 | +* We would use a join table in between names and books - Now a book can be owned by more than one person (but notice: this is arguably the wrong modeling) |
|
| 273 | + |
|
| 274 | +<div> |
|
| 275 | +<div style="float: left;"> |
|
| 276 | +<b>names</b> |
|
| 277 | +<table> |
|
| 278 | + <tr><th>first_name</th><th>id</th></tr> |
|
| 279 | + <tr><td>John</td><td style="background-color: #ffc0a0;">1</td></tr> |
|
| 280 | + <tr><td>James</td><td style="background-color: #ffc0a0;">2</td></tr> |
|
| 281 | + <tr><td>Tom</td><td style="background-color: #ffc0a0;">3</td></tr> |
|
| 282 | + <tr><td>Patrick</td><td style="background-color: #ffc0a0;">4</td></tr> |
|
| 283 | +</table> |
|
| 284 | +</div> |
|
| 285 | + |
|
| 286 | +<div style="float: left; padding-left: 100px;"> |
|
| 287 | + <b>names_books</b> |
|
| 288 | + <table> |
|
| 289 | + <tr><th>name_id</th><th>book_id</th></tr> |
|
| 290 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">1</td></tr> |
|
| 291 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">2</td></tr> |
|
| 292 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">4</td></tr> |
|
| 293 | + <tr><td style="background-color: #ffc0a0;">2</td><td style="background-color: red; opacity: 0.5;">3</td></tr> |
|
| 294 | + <tr><td style="background-color: #ffc0a0;">3</td><td style="background-color: red; opacity: 0.5;">4</td></tr> |
|
| 295 | + </table> |
|
| 296 | +</div> |
|
| 297 | + |
|
| 298 | +<div style="float: right;"> |
|
| 299 | +<b>books</b> |
|
| 300 | +<table> |
|
| 301 | + <tr><th>id</th><th>title</th></tr> |
|
| 302 | + <tr><td style="background-color: red; opacity: 0.5;">1</td><td>The C Programming Language</td></tr> |
|
| 303 | + <tr><td style="background-color: red; opacity: 0.5;">2</td><td>The Ruby Programming Language</td></tr> |
|
| 304 | + <tr><td style="background-color: red; opacity: 0.5;">3</td><td>Java Concurrency in Practice</td></tr> |
|
| 305 | + <tr><td style="background-color: red; opacity: 0.5;">4</td><td>SQL for Smarties</td></tr> |
|
| 306 | + <tr><td style="background-color: red; opacity: 0.5;">5</td><td>Specification by Example</td></tr> |
|
| 307 | +</table> |
|
| 308 | +</div> |
|
| 309 | +</div> |
|
| 310 | +<div style="clear: both;"/> |
|
| 311 | +<br/> |
|
| 312 | + |
|
| 313 | +* Names and ownership |
|
| 314 | + |
|
| 315 | + select names.first_name, books.title |
|
| 316 | + from names, books, names_books |
|
| 317 | + where names.id = names_books.name_id |
|
| 318 | + and books.id = names_books.book_id; |
|
| 319 | + |
|
| 320 | +### [20] explain plan |
|
| 321 | + |
|
| 322 | +* table with slow query |
|
| 323 | + |
|
| 324 | + drop table example; |
|
| 325 | + create table example |
|
| 326 | + as select generate_series(1, 1000000) as id, |
|
| 327 | + 'some message'::varchar(100) as message; |
|
| 328 | + insert into example values (1000001, 'different message'); |
|
| 329 | + select * from example where message = 'different message'; -- 250ms |
|
| 330 | + |
|
| 331 | +* see the query plan |
|
| 332 | + |
|
| 333 | + explain select * from example where message = 'different message'; |
|
| 334 | + |
|
| 335 | +* add an index, re-run query, and check plan again |
|
| 336 | + |
|
| 337 | + create index on example (message); |
|
| 338 | + select * from example where message = 'different message'; --16ms |
|
| 339 | + explain select * from example where message = 'different message'; |
|
| 340 | + |
|
| 341 | +### [21] Aggregation and group by, having, etc. |
|
| 342 | + |
|
| 343 | +* `group by` provides for a means of showing the distinct values for a column |
|
| 344 | + |
|
| 345 | + select message from example group by message; |
|
| 346 | + -- compare: |
|
| 347 | + -- select distinct message from example; |
|
| 348 | + |
|
| 349 | +* But even better, we can aggregate according to the groups |
|
| 350 | + |
|
| 351 | + select message, count(*) from example group by message; |
|
| 352 | + |
|
| 353 | +* But even mo' better, we can as for groups `having` certain characteristics |
|
| 354 | + |
|
| 355 | + select message, count(*) from example group by message having count(*) = 1; |
|
| 356 | + |
|
| 357 | +### [22] Subquery expressions |
|
| 358 | + |
|
| 359 | +* You want a predicate that checks another table |
|
| 360 | +* Example: find all patients with an appointment |
|
| 361 | + |
|
| 362 | + select count(*) from patients |
|
| 363 | + where exists ( |
|
| 364 | + select 1 |
|
| 365 | + from appointment_storages |
|
| 366 | + where appointment_storages.patient_id = patients.id |
|
| 367 | + ); |
|
| 368 | + |
|
| 369 | +* Other subquery expressions: `in`, `not in`, `any`; see docs (<http://www.postgresql.org/docs/9.2/static/functions-subquery.html>) |
|
| 370 | +* Why? Expressive. Compare this wrong query trying to do the same thing |
|
| 371 | + |
|
| 372 | + select count(*) |
|
| 373 | + from patients, appointment_storages |
|
| 374 | + where appointment_storages.patient_id = patients.id; |
|
| 375 | + |
|
| 376 | +* And compare this correct query trying to do the same thing |
|
| 377 | + |
|
| 378 | + select count(distinct patients.id) |
|
| 379 | + from patients, appointment_storages |
|
| 380 | + where appointment_storages.patient_id = patients.id; |
|
| 381 | + |
|
| 382 | +### [23] More about explain |
|
| 383 | + |
|
| 384 | +* Can see cost estimates; first value is startup, second is doing the work |
|
| 385 | + |
|
| 386 | + explain select count(*) from patients |
|
| 387 | + where exists ( |
|
| 388 | + select 1 |
|
| 389 | + from appointment_storages |
|
| 390 | + where appointment_storages.patient_id = patients.id |
|
| 391 | + ); |
|
| 392 | + |
|
| 393 | +* Can also run (with `analyze`) to see actuals vs. estimates |
|
| 394 | + |
|
| 395 | + explain analyze select count(*) from patients |
|
| 396 | + where exists ( |
|
| 397 | + select 1 |
|
| 398 | + from appointment_storages |
|
| 399 | + where appointment_storages.patient_id = patients.id |
|
| 400 | + ); |
|
| 401 | + |
|
| 402 | +* Hey, why is explain telling me it's going to do a sequential scan even though I have an index? |
|
| 403 | +* pgadmin has a visual explainer (demo) |
|
| 404 | +* Recommended: <http://wiki.postgresql.org/images/4/45/Explaining_EXPLAIN.pdf> |
|
| 405 | +* Want more? See _PostgreSQL 9.0 High Performance_ (in Engineering library) |
|
| 406 | + |
|
| 407 | +### [24] self-joins |
|
| 408 | + |
|
| 409 | +* The classic problem: Find all instances of a staff member creating a task before another is completed. |
|
| 410 | +* Always explore your data first **<-- best practice** |
|
| 411 | +* First find a user with not too many tasks |
|
| 412 | + |
|
| 413 | + select users.id, users.email, practices.name, count(tasks.*) |
|
| 414 | + from users, tasks, practices |
|
| 415 | + where tasks.created_by_id = users.id |
|
| 416 | + and practices.id = users.practice_id |
|
| 417 | + group by users.id, users.email, practices.name |
|
| 418 | + having count(tasks.*) < 50 |
|
| 419 | + order by count(tasks.*) desc; |
|
| 420 | + |
|
| 421 | +* Look at tasks for such a user to get a feel for the data |
|
| 422 | + |
|
| 423 | + select tasks.title, tasks.created_at, tasks.completed_at |
|
| 424 | + from tasks, users |
|
| 425 | + where tasks.created_by_id = users.id |
|
| 426 | + and users.id = 66 |
|
| 427 | + order by created_at asc; |
|
| 428 | + |
|
| 429 | +* now join tasks with tasks |
|
| 430 | + |
|
| 431 | + select a.title || ' (' || a.id || ') overlaps ' || b.title || ' (' || b.id || ')' |
|
| 432 | + from tasks a, tasks b |
|
| 433 | + where a.created_by_id = b.created_by_id |
|
| 434 | + and b.created_at >= a.created_at |
|
| 435 | + and b.created_at <= a.completed_at |
|
| 436 | + and a.id != b.id |
|
| 437 | + and a.created_by_id = 66 |
|
| 438 | + and b.created_by_id = 66 |
|
| 439 | + order by a.id asc; |
|
| 440 | + |
|
| 441 | +* inspect some examples |
|
| 442 | + |
|
| 443 | + select * from tasks where id in (17359, 17401, 17402, 20457); |
|
| 444 | + |
|
| 445 | +### [25] Another: Find the first weight measurement (or null) for every patient |
|
| 446 | + |
|
| 447 | +* Let's get the earliest measured_at for each patient/marker combination |
|
| 448 | + |
|
| 449 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
| 450 | + from measurements |
|
| 451 | + group by patient_id, marker_cname; |
|
| 452 | + |
|
| 453 | +* We are using min() here. Could there be two measurements for the same patient and marker with the same measured_at? |
|
| 454 | + |
|
| 455 | + select m.patient_id, m.marker_cname, m.measured_at, count(*) |
|
| 456 | + from ( |
|
| 457 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
| 458 | + from measurements |
|
| 459 | + group by patient_id, marker_cname) firsts, |
|
| 460 | + measurements m |
|
| 461 | + where firsts.patient_id = m.patient_id |
|
| 462 | + and firsts.marker_cname = m.marker_cname |
|
| 463 | + and firsts.first_measured_at = m.measured_at |
|
| 464 | + group by m.patient_id, m.marker_cname, m.measured_at |
|
| 465 | + having count(*) > 1; |
|
| 466 | + |
|
| 467 | +* Yes. Well that sucks. That might be a bug or deserve a constraint. What we will do is get the first id for the earliest measured_at for each patient/marker combination. |
|
| 468 | + |
|
| 469 | + select m.patient_id, m.marker_cname, min(id) as id |
|
| 470 | + from ( |
|
| 471 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
| 472 | + from measurements |
|
| 473 | + group by patient_id, marker_cname) firsts, |
|
| 474 | + measurements m |
|
| 475 | + where firsts.patient_id = m.patient_id |
|
| 476 | + and firsts.marker_cname = m.marker_cname |
|
| 477 | + and firsts.first_measured_at = m.measured_at |
|
| 478 | + group by m.patient_id, m.marker_cname; |
|
| 479 | + |
|
| 480 | +* Now this is a very useful set of rows. We can do left joins from the patients table into this and select for various markers; and then join from this table into measurements. We can create a temporary table and direct PG to drop it after a commit. Then we can use and re-use this temporary table in |
|
| 481 | +a series of left joins. |
|
| 482 | + |
|
| 483 | + create temporary table firsts on commit drop as |
|
| 484 | + select m.patient_id, m.marker_cname, min(id) as id |
|
| 485 | + from ( |
|
| 486 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
| 487 | + from measurements |
|
| 488 | + group by patient_id, marker_cname) firsts, |
|
| 489 | + measurements m |
|
| 490 | + where firsts.patient_id = m.patient_id |
|
| 491 | + and firsts.marker_cname = m.marker_cname |
|
| 492 | + and firsts.first_measured_at = m.measured_at |
|
| 493 | + group by m.patient_id, m.marker_cname; |
|
| 494 | + |
|
| 495 | + select p.id, weight, weight_first_measured_at, height, height_first_measured_at, ldl, ldl_first_measured_at |
|
| 496 | + from patients p |
|
| 497 | + left join ( |
|
| 498 | + select f.patient_id, m.value as weight, m.measured_at as weight_first_measured_at |
|
| 499 | + from firsts f, measurements m where f.marker_cname = 'weight' and m.id = f.id) weights |
|
| 500 | + on weights.patient_id = p.id |
|
| 501 | + left join ( |
|
| 502 | + select f.patient_id, m.value as height, m.measured_at as height_first_measured_at |
|
| 503 | + from firsts f, measurements m where f.marker_cname = 'height' and m.id = f.id) heights |
|
| 504 | + on heights.patient_id = p.id |
|
| 505 | + left join ( |
|
| 506 | + select f.patient_id, m.value as ldl, m.measured_at as ldl_first_measured_at |
|
| 507 | + from firsts f, measurements m where f.marker_cname = 'ldl' and m.id = f.id) ldls |
|
| 508 | + on ldls.patient_id = p.id |
|
| 509 | + ; |
|
| 510 | + |
|
| 511 | +### [26] PG "distinct on" |
|
| 512 | + |
|
| 513 | +* When you want the first row in a group, you can use PG's "distinct on" clause (<http://www.postgresql.org/docs/9.2/static/sql-select.html#SQL-DISTINCT>) |
|
| 514 | +* But watch out, this can return surprising results |
|
| 515 | +* People commonly use it to mean: "Give me the 'top' something in this group" -- but what if there is a tie? Example |
|
| 516 | + |
|
| 517 | + drop table if exists employees; |
|
| 518 | + create table employees ( |
|
| 519 | + name varchar(100), |
|
| 520 | + department integer, |
|
| 521 | + salary integer); |
|
| 522 | + insert into employees (name, department, salary) values ('John', 1, 20000); |
|
| 523 | + insert into employees (name, department, salary) values ('Joe', 1, 20000); |
|
| 524 | + insert into employees (name, department, salary) values ('Jill', 2, 40000); |
|
| 525 | + |
|
| 526 | + select department, max(salary) as max_salary |
|
| 527 | + from employees |
|
| 528 | + group by department; |
|
| 529 | + |
|
| 530 | + select e.department, e.name, e.salary |
|
| 531 | + from employees e, |
|
| 532 | + (select department, max(salary) as max_salary |
|
| 533 | + from employees |
|
| 534 | + group by department) maxs |
|
| 535 | + where e.department = maxs.department |
|
| 536 | + and e.salary = maxs.max_salary; |
|
| 537 | + |
|
| 538 | + select distinct on (department) * |
|
| 539 | + from employees |
|
| 540 | + order by department, salary desc; |
|
| 541 | + |
|
| 542 | +* Advice: Learn your SQL according to the standard (see Appendix B in _SQL Cookbook_) |
|
| 543 | + |
|
| 544 | +### [27] What is a PG database? |
|
| 545 | + |
|
| 546 | +* A PG database is that to which you connect |
|
| 547 | +* Got it? You can only query a database to which you are connected |
|
| 548 | +* A database contains schemas which contain tables |
|
| 549 | +* We don't deal with schemas because our tables are in the default schema (`public`) |
|
| 550 | + |
|
| 551 | +### [28] What is a PG schema? |
|
| 552 | + |
|
| 553 | +* Think of a PostgreSQL schema as a prefix to a table name (public.patients) |
|
| 554 | +* A database can have tables in different schemas |
|
| 555 | +* PG will match unprefixed table names according to a search path |
|
| 556 | +* Why do we like it? |
|
| 557 | + * Tables in a schema can be dropped together |
|
| 558 | + * Tables in a schema can be authorized separately |
|
| 559 | + * Tables can have the same name if in different schemas |
|
| 560 | + |
|
| 561 | +### [29] What is a PG tablespace? |
|
| 562 | + |
|
| 563 | +* A PG tablespace is a location in the file system where tables (etc.) go |
|
| 564 | +* Put frequently-queried tables on a fast disk |
|
| 565 | +* Make it easier for backups |
|
| 566 | +* NOTE: In Master/Slave, must have the identical tablespace (location) on both machines |
|
| 567 | + |
|
| 568 | +### [30] When should I add an index? |
|
| 569 | + |
|
| 570 | +* When your query is slow |
|
| 571 | +* What is slow? |
|
| 572 | + * Look at the logs and compare to other queries |
|
| 573 | +* Will an index help me? |
|
| 574 | + * Try `explain analyze` on your query |
|
| 575 | + * Add an index |
|
| 576 | + * Rerun `explain analyze` |
|
| 577 | +* Don't waste your time on this exercise on your development machine (PG will for instance just use sequential access for small data sets, 'cos it's fast); use something closer to production |
|
| 578 | + |
|
| 579 | +### [31] What is a constraint? |
|
| 580 | + |
|
| 581 | +* Check constraint: Disallow insertion of a row where a column value violates a rule (e.g., height > 144 inches) |
|
| 582 | +* Not null constraint: Disallow insertion of a row where a column value is null |
|
| 583 | +* Foreign key constraint: Disallow insertion of a row with foreign key value not found as a primary key in the other table |
|
| 584 | + |
|
| 585 | +### [32] Why don't Rails developers use foreign key constraints? (and other constraints) |
|
| 586 | + |
|
| 587 | +* History: Very difficult to create an ORM that can define constraint types in a database-agnostic way |
|
| 588 | + * Surely you jest? |
|
| 589 | +* History: Lack of clarity around tier responsibility: E.g., who should validate? |
|
| 590 | +* History: Inexperience with transactions |
|
| 591 | +* History: Probably not important in Basecamp (except for billing) |
|
| 592 | +* History: There was a time when people liked polymorphic associations in ActiveRecord; and how would you constrain such keys? (Triggers?) |
|
| 593 | +* Lazy |
|
| 594 | + |
|
| 595 | +### [33] Why don't *we* use foreign key constraints? (and other constraints) |
|
| 596 | + |
|
| 597 | +* Lazy |
|
| 598 | +* To date, stories have not defined data semantics, only behavioral semantics |
|
| 599 | +* To date, stories have not defined cost of anomalies |
|
| 600 | +* schema.rb |
|
| 601 | + |
|
| 602 | +### [34] pros and cons of deleting rows from the DB (vs. flagging them as 'deleted') |
|
| 603 | + |
|
| 604 | +* Discussion |
|
| 605 | + |
|
| 606 | +### [35] pros and cons of free-text fields |
|
| 607 | + |
|
| 608 | +* Discussion |
|
| 609 | + |
|
| 610 | +### [36] editing data in-place vs. storing new versions with each edit |
|
| 611 | + |
|
| 612 | +* Discussion |
|
| 613 | + |
|
| 614 | +### [37] Recommendations for Further Reading |
|
| 615 | + |
|
| 616 | +1. "SQL Reference" in Enterprise Java in a Nutshell (O'Reilly, 2005) |
|
| 617 | +2. Molinaro, SQL Cookbook (O'Reilly, 2005) |
|
| 618 | +3. Rozenshtein, The Essence of SQL (1997) |
|
| 619 | +4. Celko, SQL for Smarties (2010) (other books can be good; some people like The SQL Puzzle Book) |
|
| 620 | +5. Karwin, SQL Antipatterns (2010) (considers EAV to be an anti-pattern; discusses rounding errors; problems with grouping; much else) |
|
| 621 | +6. Venn diagrams: <http://www.codinghorror.com/blog/2007/10/a-visual-explanation-of-sql-joins.html> |
|
| 622 | + |
|
| 623 | +### [37] Resources to avoid |
|
| 624 | + |
|
| 625 | +0. Elmasri, Fundamentals of Database Systems (6th ed., 2010). Regrettably, this book is a mess and tends to go for the most impenetrable explanation possible. Still, it has stuff on modeling and normalization that is hard to find. |
|
| 626 | +1. Anything on the web; seriously, watch out. People say really stupid stuff about SQL. |
|
| 627 | +2. Any book that is focused on one database. You will be unhappy. |
|
| 628 | + |
|
| 629 | +--- |
|
| 630 | + |
|
| 631 | +### [38] Populating names and books with join table |
|
| 632 | + |
|
| 633 | + drop table names; |
|
| 634 | + create table names ( |
|
| 635 | + id serial, -- "auto increment" |
|
| 636 | + first_name text |
|
| 637 | + ); |
|
| 638 | + drop table books; |
|
| 639 | + create table books ( |
|
| 640 | + id serial, -- "auto increment" |
|
| 641 | + title text |
|
| 642 | + ); |
|
| 643 | + |
|
| 644 | + delete from names; |
|
| 645 | + insert into names (first_name) values ('John'); |
|
| 646 | + insert into names (first_name) values ('James'); |
|
| 647 | + insert into names (first_name) values ('Tom'); |
|
| 648 | + insert into names (first_name) values ('Patrick'); |
|
| 649 | + |
|
| 650 | + delete from books; |
|
| 651 | + insert into books (title) values ('The C Programming Language'); |
|
| 652 | + insert into books (title) values ('The Ruby Programming Language'); |
|
| 653 | + insert into books (title) values ('Java Concurrency in Practice'); |
|
| 654 | + insert into books (title) values ('SQL for Smarties'); |
|
| 655 | + insert into books (title) values ('Specification by Example'); |
|
| 656 | + |
|
| 657 | + drop table if exists names_books; |
|
| 658 | + create table names_books ( |
|
| 659 | + name_id integer, |
|
| 660 | + book_id integer |
|
| 661 | + ); |
|
| 662 | + |
|
| 663 | + insert into names_books values ((select id from names where first_name = 'John'), (select id from books where title = 'The C Programming Language')); |
|
| 664 | + insert into names_books values ((select id from names where first_name = 'John'), (select id from books where title = 'The Ruby Programming Language')); |
|
| 665 | + insert into names_books values ((select id from names where first_name = 'John'), (select id from books where title = 'SQL for Smarties')); |
|
| 666 | + insert into names_books values ((select id from names where first_name = 'James'), (select id from books where title = 'Java Concurrency in Practice')); |
|
| 667 | + insert into names_books values ((select id from names where first_name = 'Tom'), (select id from books where title = 'SQL for Smarties')); |
|
| 668 | + |
|
| 669 | + |
|
| 670 | + select names.first_name, books.title |
|
| 671 | + from names, books, names_books |
|
| 672 | + where names.id = names_books.name_id |
|
| 673 | + and books.id = names_books.book_id; |
|
| 674 | + |
|
| 675 | +## Other topics |
|
| 676 | + |
|
| 677 | +### [39] The meaning of null |
|
| 678 | + |
|
| 679 | +See Elmasri. |
|
| 680 | + |
|
| 681 | +### [40] SQL injection |
|
| 682 | + |
|
| 683 | +### [41] Prepared statements |
|
| 684 | + |
|
| 685 | +### [42] Domain modeling |
|
| 686 | + |
|
| 687 | +See Elmasri. |
|
| 688 | + |
|
| 689 | +### [43] Normalization |
|
| 690 | + |
|
| 691 | +See Elmasri. |
|
| 692 | + |
|
| 693 | +### [44] Star schemas |
|
| 694 | + |
|
| 695 | +### [45] Data warehousing |
|
| 696 | + |
|
| 697 | +### [46] ETL |
|
| 698 | + |