11f8d2f868e150b190ce2d23f16f3417355da229
public/sql.md
... | ... | @@ -0,0 +1,698 @@ |
1 | +# SQL / Relational |
|
2 | + |
|
3 | +<style type="text/css"> |
|
4 | +div .venn { |
|
5 | + float: left; |
|
6 | + width: 300px; |
|
7 | + height: 300px; |
|
8 | + border-radius: 150px; |
|
9 | + border: 3px solid #000; |
|
10 | + color: #000; |
|
11 | + text-align: center; |
|
12 | + vertical-align: middle; |
|
13 | + font-size: 1.0em; |
|
14 | + line-height: 300px; |
|
15 | + font-weight: bold; |
|
16 | + font-family: sans-serif; |
|
17 | + position: relative; |
|
18 | +} |
|
19 | +.inner_one { background-color: yellow; opacity: 0.5; } |
|
20 | +.inner_two::before{ content: 'id = name_id'; position: absolute; left: 10px } |
|
21 | +.inner_two { background-color: red; opacity: 0.5; margin-left: -110px; } |
|
22 | + |
|
23 | +.left_one { background-color: #ff8040; opacity: 0.5; z-index: 1; } |
|
24 | +.left_two { background-color: red; opacity: 0.5; margin-left: -110px; } |
|
25 | +</style> |
|
26 | + |
|
27 | +### [1] Why we (I) love relational databases |
|
28 | + |
|
29 | +* **ACID** |
|
30 | + * Atomicity (transactions [multiple operations can be "all or nothing"] - even with externalities such as power loss) |
|
31 | + * Consistency (what's on disk is always acceptable as a state - including constraints, etc.) |
|
32 | + * Isolation (no transaction affects another) |
|
33 | + * Durability (if an insert completes, it is really written to disk) |
|
34 | +* Do we need these? |
|
35 | +* What apps don't need these things? |
|
36 | + * If Facebook fails to provide you with 1 in 100,000 status updates from your friends, what is the cost? |
|
37 | +* Can we get these properties from a database distributed across a network? (CAP Theorem: See <http://www.youtube.com/watch?v=Jw1iFr4v58M> and <http://blog.beany.co.kr/wp-content/uploads/2011/03/nosql_cap.png>) |
|
38 | + |
|
39 | +### [2] Creating a database and connecting (from shell) |
|
40 | + |
|
41 | + dropdb example |
|
42 | + createdb example |
|
43 | + psql example |
|
44 | + |
|
45 | +### [3] Dropping and creating a table |
|
46 | + |
|
47 | + drop table names; |
|
48 | + create table names ( |
|
49 | + id serial, -- "auto increment" |
|
50 | + first_name text |
|
51 | + ); |
|
52 | + |
|
53 | +### [4] Another |
|
54 | + |
|
55 | + drop table books; |
|
56 | + create table books ( |
|
57 | + id serial, -- "auto increment" |
|
58 | + name_id int, |
|
59 | + title text |
|
60 | + ); |
|
61 | + |
|
62 | +### [5] Adding data |
|
63 | + |
|
64 | + delete from names; |
|
65 | + insert into names (first_name) values ('John'); |
|
66 | + insert into names (first_name) values ('James'); |
|
67 | + insert into names (first_name) values ('Tom'); |
|
68 | + |
|
69 | +* **Tip:** Always give the column names so that you are not dependent on the created order |
|
70 | + |
|
71 | +### [6] more |
|
72 | + |
|
73 | + delete from books; |
|
74 | + insert into books (name_id, title) values (1, 'The C Programming Language'); |
|
75 | + insert into books (name_id, title) values (1, 'The Ruby Programming Language'); |
|
76 | + insert into books (name_id, title) values (2, 'Java Concurrency in Practice'); |
|
77 | + insert into books (name_id, title) values (null, 'SQL for Smarties'); |
|
78 | + -- Notice: We are assuming we know the values for names.id |
|
79 | + -- What if we re-ran the inserts for names? |
|
80 | + |
|
81 | +### [7] query - wildcard for columns |
|
82 | + |
|
83 | + select * from names; |
|
84 | + select * from books; |
|
85 | + |
|
86 | +### [8] query - rearranging columns and specifying order |
|
87 | + |
|
88 | + select first_name, id from names; |
|
89 | + select name_id, id, title from books; |
|
90 | + |
|
91 | +### [9] cross join - return "product" (all combinations) of two tables |
|
92 | + |
|
93 | + select * |
|
94 | + from names, books; |
|
95 | + -- don't do this with big tables |
|
96 | + |
|
97 | +### [10] same thing, but putting ids next together |
|
98 | + |
|
99 | + select names.first_name, names.id as "names.id", |
|
100 | + books.name_id as "books.name_id", books.id as "books.id", books.title |
|
101 | + from names, books |
|
102 | + order by names.id; |
|
103 | + -- don't do this with big tables |
|
104 | + |
|
105 | +### [11] inner join - Venn diagram |
|
106 | + |
|
107 | +--- |
|
108 | + |
|
109 | +<b>Inner join: What we want is represented by the <span style="background-color: #ff8040;">orange</span> section</b> |
|
110 | +<br/> |
|
111 | + |
|
112 | +<div class="venn inner_one">Names</div> |
|
113 | +<div class="venn inner_two">Books</div> |
|
114 | +<div style="clear: both;"/> |
|
115 | + |
|
116 | +<div> |
|
117 | +<div style="float: left;"> |
|
118 | +<b>names</b> |
|
119 | +<table> |
|
120 | + <tr><th>first_name</th><th>id</th></tr> |
|
121 | + <tr><td>John</td><td style="background-color: yellow; opacity: 0.5;">1</td></tr> |
|
122 | + <tr><td>James</td><td style="background-color: yellow; opacity: 0.5;">2</td></tr> |
|
123 | + <tr><td>Tom</td><td style="background-color: yellow; opacity: 0.5;">3</td></tr> |
|
124 | +</table> |
|
125 | +</div> |
|
126 | + |
|
127 | +<div style="float: right;"> |
|
128 | +<b>books</b> |
|
129 | +<table> |
|
130 | + <tr><th>name_id</th><th>id</th><th>title</th></tr> |
|
131 | + <tr><td style="background-color: yellow; opacity: 0.5;">1</td><td style="background-color: red; opacity: 0.5;">1</td><td>The C Programming Language</td></tr> |
|
132 | + <tr><td style="background-color: yellow; opacity: 0.5;">1</td><td style="background-color: red; opacity: 0.5;">2</td><td>The Ruby Programming Language</td></tr> |
|
133 | + <tr><td style="background-color: yellow; opacity: 0.5;">2</td><td style="background-color: red; opacity: 0.5;">3</td><td>Java Concurrency in Practice</td></tr> |
|
134 | + <tr><td>(null)</td><td style="background-color: red; opacity: 0.5;">4</td><td>SQL for Smarties</td></tr> |
|
135 | +</table> |
|
136 | +</div> |
|
137 | +</div> |
|
138 | +<div style="clear: both;"/> |
|
139 | +<br/> |
|
140 | + |
|
141 | +--- |
|
142 | + |
|
143 | +### [12] inner join - SQL - "Give me data from rows from both tables where the condition is true" |
|
144 | + |
|
145 | + -- "explicit join notation" |
|
146 | + select first_name, title |
|
147 | + from books join names |
|
148 | + on names.id = books.name_id; |
|
149 | + |
|
150 | + -- "implicit join notation" |
|
151 | + select first_name, title |
|
152 | + from books, names |
|
153 | + where names.id = books.name_id; |
|
154 | + |
|
155 | + -- Same as "product" join above, but using where clause |
|
156 | + select names.first_name, names.id as "names.id", |
|
157 | + books.name_id as "books.name_id", books.id as "books.id", books.title |
|
158 | + from names, books |
|
159 | + where names.id = books.name_id |
|
160 | + order by names.id; |
|
161 | + |
|
162 | +### [13] left join - Venn diagram |
|
163 | + |
|
164 | +--- |
|
165 | + |
|
166 | +<b>Left join: What we want is represented by the <span style="background-color: #ffc0a0;">salmon</span> and <span style="background-color: #ff8040;">orange</span> sections</b> |
|
167 | +<br/> |
|
168 | + |
|
169 | +<div class="venn left_one">Names</div> |
|
170 | +<div class="venn left_two">Books</div> |
|
171 | +<div style="clear: both;"/> |
|
172 | + |
|
173 | +<div> |
|
174 | +<div style="float: left;"> |
|
175 | +<b>names</b> |
|
176 | +<table> |
|
177 | + <tr><th>first_name</th><th>id</th></tr> |
|
178 | + <tr><td>John</td><td style="background-color: #ffc0a0;">1</td></tr> |
|
179 | + <tr><td>James</td><td style="background-color: #ffc0a0;">2</td></tr> |
|
180 | + <tr><td>Tom</td><td style="background-color: #ffc0a0;">3</td></tr> |
|
181 | +</table> |
|
182 | +</div> |
|
183 | + |
|
184 | +<div style="float: right;"> |
|
185 | +<b>books</b> |
|
186 | +<table> |
|
187 | + <tr><th>name_id</th><th>id</th><th>title</th></tr> |
|
188 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">1</td><td>The C Programming Language</td></tr> |
|
189 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">2</td><td>The Ruby Programming Language</td></tr> |
|
190 | + <tr><td style="background-color: #ffc0a0;">2</td><td style="background-color: red; opacity: 0.5;">3</td><td>Java Concurrency in Practice</td></tr> |
|
191 | + <tr><td>(null)</td><td style="background-color: red; opacity: 0.5;">4</td><td>SQL for Smarties</td></tr> |
|
192 | +</table> |
|
193 | +</div> |
|
194 | +</div> |
|
195 | +<div style="clear: both;"/> |
|
196 | +<br/> |
|
197 | + |
|
198 | +--- |
|
199 | + |
|
200 | +### [14] left join - SQL - "Give me data for 'left' table, and data the 'right' table when the condition is true, but nulls for the 'right' table when the condition is false" |
|
201 | + |
|
202 | + select names.first_name, books.title |
|
203 | + from names left join books |
|
204 | + on books.name_id = names.id; |
|
205 | + |
|
206 | + first_name | title |
|
207 | + ------------+------------------------------- |
|
208 | + John | The C Programming Language |
|
209 | + John | The Ruby Programming Language |
|
210 | + James | Java Concurrency in Practice |
|
211 | + Tom | |
|
212 | + (4 rows) |
|
213 | + |
|
214 | +### [15] left join is dominate pattern in ActiveRecord . . . |
|
215 | + |
|
216 | +* why? |
|
217 | + |
|
218 | +### [16] left join is dominate pattern in ActiveRecord |
|
219 | + |
|
220 | +* when we send an association message to a relation, we expect that association to be applied to each element |
|
221 | +* Note, however, that ActiveRecord avoids real left joins |
|
222 | + |
|
223 | +### [17] Now that you're so smart |
|
224 | + |
|
225 | +* How many rows will each of the following queries return? |
|
226 | + |
|
227 | + select * from names left join books on books.name_id = names.id; |
|
228 | + select * from names right join books on books.name_id = names.id; |
|
229 | + select * from books left join names on books.name_id = names.id; |
|
230 | + select * from books right join names on books.name_id = names.id; |
|
231 | + |
|
232 | +### [18] demo - study the log |
|
233 | + |
|
234 | + irb |
|
235 | + require 'active_record' |
|
236 | + require 'pg' |
|
237 | + require 'logger' |
|
238 | + B = ActiveRecord::Base |
|
239 | + B.logger = Logger.new(STDOUT) |
|
240 | + M = ActiveRecord::Migration |
|
241 | + |
|
242 | + spec = { adapter: :postgresql, database: :ex } |
|
243 | + B.establish_connection(adapter: :postgresql, database: :postgres, schema_search_path: :public) |
|
244 | + B.connection.drop_database spec[:database] |
|
245 | + B.connection.create_database spec[:database] |
|
246 | + B.establish_connection spec |
|
247 | + |
|
248 | + B.mass_assignment_sanitizer = :logger |
|
249 | + CN = Class.new(M) { def change; create_table :names do |t| t.text :first_name end; end } |
|
250 | + CB = Class.new(M) { def change; create_table :books do |t| t.text :title; t.references :name; end; end } |
|
251 | + CN.migrate(:up) |
|
252 | + CB.migrate(:up) |
|
253 | + Name = Class.new(B) { has_many :books } |
|
254 | + Book = Class.new(B) { belongs_to :name } |
|
255 | + |
|
256 | + john = Name.create! first_name: 'John' |
|
257 | + james = Name.create! first_name: 'James' |
|
258 | + tom = Name.create! first_name: 'Tom' |
|
259 | + c = Book.create! name: john, title: 'The C Programming Language' |
|
260 | + ruby = Book.create! name: john, title: 'The Ruby Programming Language' |
|
261 | + java = Book.create! name: james, title: 'Java Concurrency in Practice' |
|
262 | + |
|
263 | + # Let's discuss what we see in the log |
|
264 | + |
|
265 | + Name.all(include: :books) |
|
266 | + Name.all(joins: :books) |
|
267 | + Name.all(joins: 'left join books on books.name_id = names.id') |
|
268 | + Name.all(joins: 'left join books on books.name_id = names.id', include: :books) |
|
269 | + |
|
270 | +### [19] obviously not the way we would represent book ownership for realsies |
|
271 | + |
|
272 | +* We would use a join table in between names and books - Now a book can be owned by more than one person (but notice: this is arguably the wrong modeling) |
|
273 | + |
|
274 | +<div> |
|
275 | +<div style="float: left;"> |
|
276 | +<b>names</b> |
|
277 | +<table> |
|
278 | + <tr><th>first_name</th><th>id</th></tr> |
|
279 | + <tr><td>John</td><td style="background-color: #ffc0a0;">1</td></tr> |
|
280 | + <tr><td>James</td><td style="background-color: #ffc0a0;">2</td></tr> |
|
281 | + <tr><td>Tom</td><td style="background-color: #ffc0a0;">3</td></tr> |
|
282 | + <tr><td>Patrick</td><td style="background-color: #ffc0a0;">4</td></tr> |
|
283 | +</table> |
|
284 | +</div> |
|
285 | + |
|
286 | +<div style="float: left; padding-left: 100px;"> |
|
287 | + <b>names_books</b> |
|
288 | + <table> |
|
289 | + <tr><th>name_id</th><th>book_id</th></tr> |
|
290 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">1</td></tr> |
|
291 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">2</td></tr> |
|
292 | + <tr><td style="background-color: #ffc0a0;">1</td><td style="background-color: red; opacity: 0.5;">4</td></tr> |
|
293 | + <tr><td style="background-color: #ffc0a0;">2</td><td style="background-color: red; opacity: 0.5;">3</td></tr> |
|
294 | + <tr><td style="background-color: #ffc0a0;">3</td><td style="background-color: red; opacity: 0.5;">4</td></tr> |
|
295 | + </table> |
|
296 | +</div> |
|
297 | + |
|
298 | +<div style="float: right;"> |
|
299 | +<b>books</b> |
|
300 | +<table> |
|
301 | + <tr><th>id</th><th>title</th></tr> |
|
302 | + <tr><td style="background-color: red; opacity: 0.5;">1</td><td>The C Programming Language</td></tr> |
|
303 | + <tr><td style="background-color: red; opacity: 0.5;">2</td><td>The Ruby Programming Language</td></tr> |
|
304 | + <tr><td style="background-color: red; opacity: 0.5;">3</td><td>Java Concurrency in Practice</td></tr> |
|
305 | + <tr><td style="background-color: red; opacity: 0.5;">4</td><td>SQL for Smarties</td></tr> |
|
306 | + <tr><td style="background-color: red; opacity: 0.5;">5</td><td>Specification by Example</td></tr> |
|
307 | +</table> |
|
308 | +</div> |
|
309 | +</div> |
|
310 | +<div style="clear: both;"/> |
|
311 | +<br/> |
|
312 | + |
|
313 | +* Names and ownership |
|
314 | + |
|
315 | + select names.first_name, books.title |
|
316 | + from names, books, names_books |
|
317 | + where names.id = names_books.name_id |
|
318 | + and books.id = names_books.book_id; |
|
319 | + |
|
320 | +### [20] explain plan |
|
321 | + |
|
322 | +* table with slow query |
|
323 | + |
|
324 | + drop table example; |
|
325 | + create table example |
|
326 | + as select generate_series(1, 1000000) as id, |
|
327 | + 'some message'::varchar(100) as message; |
|
328 | + insert into example values (1000001, 'different message'); |
|
329 | + select * from example where message = 'different message'; -- 250ms |
|
330 | + |
|
331 | +* see the query plan |
|
332 | + |
|
333 | + explain select * from example where message = 'different message'; |
|
334 | + |
|
335 | +* add an index, re-run query, and check plan again |
|
336 | + |
|
337 | + create index on example (message); |
|
338 | + select * from example where message = 'different message'; --16ms |
|
339 | + explain select * from example where message = 'different message'; |
|
340 | + |
|
341 | +### [21] Aggregation and group by, having, etc. |
|
342 | + |
|
343 | +* `group by` provides for a means of showing the distinct values for a column |
|
344 | + |
|
345 | + select message from example group by message; |
|
346 | + -- compare: |
|
347 | + -- select distinct message from example; |
|
348 | + |
|
349 | +* But even better, we can aggregate according to the groups |
|
350 | + |
|
351 | + select message, count(*) from example group by message; |
|
352 | + |
|
353 | +* But even mo' better, we can as for groups `having` certain characteristics |
|
354 | + |
|
355 | + select message, count(*) from example group by message having count(*) = 1; |
|
356 | + |
|
357 | +### [22] Subquery expressions |
|
358 | + |
|
359 | +* You want a predicate that checks another table |
|
360 | +* Example: find all patients with an appointment |
|
361 | + |
|
362 | + select count(*) from patients |
|
363 | + where exists ( |
|
364 | + select 1 |
|
365 | + from appointment_storages |
|
366 | + where appointment_storages.patient_id = patients.id |
|
367 | + ); |
|
368 | + |
|
369 | +* Other subquery expressions: `in`, `not in`, `any`; see docs (<http://www.postgresql.org/docs/9.2/static/functions-subquery.html>) |
|
370 | +* Why? Expressive. Compare this wrong query trying to do the same thing |
|
371 | + |
|
372 | + select count(*) |
|
373 | + from patients, appointment_storages |
|
374 | + where appointment_storages.patient_id = patients.id; |
|
375 | + |
|
376 | +* And compare this correct query trying to do the same thing |
|
377 | + |
|
378 | + select count(distinct patients.id) |
|
379 | + from patients, appointment_storages |
|
380 | + where appointment_storages.patient_id = patients.id; |
|
381 | + |
|
382 | +### [23] More about explain |
|
383 | + |
|
384 | +* Can see cost estimates; first value is startup, second is doing the work |
|
385 | + |
|
386 | + explain select count(*) from patients |
|
387 | + where exists ( |
|
388 | + select 1 |
|
389 | + from appointment_storages |
|
390 | + where appointment_storages.patient_id = patients.id |
|
391 | + ); |
|
392 | + |
|
393 | +* Can also run (with `analyze`) to see actuals vs. estimates |
|
394 | + |
|
395 | + explain analyze select count(*) from patients |
|
396 | + where exists ( |
|
397 | + select 1 |
|
398 | + from appointment_storages |
|
399 | + where appointment_storages.patient_id = patients.id |
|
400 | + ); |
|
401 | + |
|
402 | +* Hey, why is explain telling me it's going to do a sequential scan even though I have an index? |
|
403 | +* pgadmin has a visual explainer (demo) |
|
404 | +* Recommended: <http://wiki.postgresql.org/images/4/45/Explaining_EXPLAIN.pdf> |
|
405 | +* Want more? See _PostgreSQL 9.0 High Performance_ (in Engineering library) |
|
406 | + |
|
407 | +### [24] self-joins |
|
408 | + |
|
409 | +* The classic problem: Find all instances of a staff member creating a task before another is completed. |
|
410 | +* Always explore your data first **<-- best practice** |
|
411 | +* First find a user with not too many tasks |
|
412 | + |
|
413 | + select users.id, users.email, practices.name, count(tasks.*) |
|
414 | + from users, tasks, practices |
|
415 | + where tasks.created_by_id = users.id |
|
416 | + and practices.id = users.practice_id |
|
417 | + group by users.id, users.email, practices.name |
|
418 | + having count(tasks.*) < 50 |
|
419 | + order by count(tasks.*) desc; |
|
420 | + |
|
421 | +* Look at tasks for such a user to get a feel for the data |
|
422 | + |
|
423 | + select tasks.title, tasks.created_at, tasks.completed_at |
|
424 | + from tasks, users |
|
425 | + where tasks.created_by_id = users.id |
|
426 | + and users.id = 66 |
|
427 | + order by created_at asc; |
|
428 | + |
|
429 | +* now join tasks with tasks |
|
430 | + |
|
431 | + select a.title || ' (' || a.id || ') overlaps ' || b.title || ' (' || b.id || ')' |
|
432 | + from tasks a, tasks b |
|
433 | + where a.created_by_id = b.created_by_id |
|
434 | + and b.created_at >= a.created_at |
|
435 | + and b.created_at <= a.completed_at |
|
436 | + and a.id != b.id |
|
437 | + and a.created_by_id = 66 |
|
438 | + and b.created_by_id = 66 |
|
439 | + order by a.id asc; |
|
440 | + |
|
441 | +* inspect some examples |
|
442 | + |
|
443 | + select * from tasks where id in (17359, 17401, 17402, 20457); |
|
444 | + |
|
445 | +### [25] Another: Find the first weight measurement (or null) for every patient |
|
446 | + |
|
447 | +* Let's get the earliest measured_at for each patient/marker combination |
|
448 | + |
|
449 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
450 | + from measurements |
|
451 | + group by patient_id, marker_cname; |
|
452 | + |
|
453 | +* We are using min() here. Could there be two measurements for the same patient and marker with the same measured_at? |
|
454 | + |
|
455 | + select m.patient_id, m.marker_cname, m.measured_at, count(*) |
|
456 | + from ( |
|
457 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
458 | + from measurements |
|
459 | + group by patient_id, marker_cname) firsts, |
|
460 | + measurements m |
|
461 | + where firsts.patient_id = m.patient_id |
|
462 | + and firsts.marker_cname = m.marker_cname |
|
463 | + and firsts.first_measured_at = m.measured_at |
|
464 | + group by m.patient_id, m.marker_cname, m.measured_at |
|
465 | + having count(*) > 1; |
|
466 | + |
|
467 | +* Yes. Well that sucks. That might be a bug or deserve a constraint. What we will do is get the first id for the earliest measured_at for each patient/marker combination. |
|
468 | + |
|
469 | + select m.patient_id, m.marker_cname, min(id) as id |
|
470 | + from ( |
|
471 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
472 | + from measurements |
|
473 | + group by patient_id, marker_cname) firsts, |
|
474 | + measurements m |
|
475 | + where firsts.patient_id = m.patient_id |
|
476 | + and firsts.marker_cname = m.marker_cname |
|
477 | + and firsts.first_measured_at = m.measured_at |
|
478 | + group by m.patient_id, m.marker_cname; |
|
479 | + |
|
480 | +* Now this is a very useful set of rows. We can do left joins from the patients table into this and select for various markers; and then join from this table into measurements. We can create a temporary table and direct PG to drop it after a commit. Then we can use and re-use this temporary table in |
|
481 | +a series of left joins. |
|
482 | + |
|
483 | + create temporary table firsts on commit drop as |
|
484 | + select m.patient_id, m.marker_cname, min(id) as id |
|
485 | + from ( |
|
486 | + select patient_id, marker_cname, min(measured_at) as first_measured_at |
|
487 | + from measurements |
|
488 | + group by patient_id, marker_cname) firsts, |
|
489 | + measurements m |
|
490 | + where firsts.patient_id = m.patient_id |
|
491 | + and firsts.marker_cname = m.marker_cname |
|
492 | + and firsts.first_measured_at = m.measured_at |
|
493 | + group by m.patient_id, m.marker_cname; |
|
494 | + |
|
495 | + select p.id, weight, weight_first_measured_at, height, height_first_measured_at, ldl, ldl_first_measured_at |
|
496 | + from patients p |
|
497 | + left join ( |
|
498 | + select f.patient_id, m.value as weight, m.measured_at as weight_first_measured_at |
|
499 | + from firsts f, measurements m where f.marker_cname = 'weight' and m.id = f.id) weights |
|
500 | + on weights.patient_id = p.id |
|
501 | + left join ( |
|
502 | + select f.patient_id, m.value as height, m.measured_at as height_first_measured_at |
|
503 | + from firsts f, measurements m where f.marker_cname = 'height' and m.id = f.id) heights |
|
504 | + on heights.patient_id = p.id |
|
505 | + left join ( |
|
506 | + select f.patient_id, m.value as ldl, m.measured_at as ldl_first_measured_at |
|
507 | + from firsts f, measurements m where f.marker_cname = 'ldl' and m.id = f.id) ldls |
|
508 | + on ldls.patient_id = p.id |
|
509 | + ; |
|
510 | + |
|
511 | +### [26] PG "distinct on" |
|
512 | + |
|
513 | +* When you want the first row in a group, you can use PG's "distinct on" clause (<http://www.postgresql.org/docs/9.2/static/sql-select.html#SQL-DISTINCT>) |
|
514 | +* But watch out, this can return surprising results |
|
515 | +* People commonly use it to mean: "Give me the 'top' something in this group" -- but what if there is a tie? Example |
|
516 | + |
|
517 | + drop table if exists employees; |
|
518 | + create table employees ( |
|
519 | + name varchar(100), |
|
520 | + department integer, |
|
521 | + salary integer); |
|
522 | + insert into employees (name, department, salary) values ('John', 1, 20000); |
|
523 | + insert into employees (name, department, salary) values ('Joe', 1, 20000); |
|
524 | + insert into employees (name, department, salary) values ('Jill', 2, 40000); |
|
525 | + |
|
526 | + select department, max(salary) as max_salary |
|
527 | + from employees |
|
528 | + group by department; |
|
529 | + |
|
530 | + select e.department, e.name, e.salary |
|
531 | + from employees e, |
|
532 | + (select department, max(salary) as max_salary |
|
533 | + from employees |
|
534 | + group by department) maxs |
|
535 | + where e.department = maxs.department |
|
536 | + and e.salary = maxs.max_salary; |
|
537 | + |
|
538 | + select distinct on (department) * |
|
539 | + from employees |
|
540 | + order by department, salary desc; |
|
541 | + |
|
542 | +* Advice: Learn your SQL according to the standard (see Appendix B in _SQL Cookbook_) |
|
543 | + |
|
544 | +### [27] What is a PG database? |
|
545 | + |
|
546 | +* A PG database is that to which you connect |
|
547 | +* Got it? You can only query a database to which you are connected |
|
548 | +* A database contains schemas which contain tables |
|
549 | +* We don't deal with schemas because our tables are in the default schema (`public`) |
|
550 | + |
|
551 | +### [28] What is a PG schema? |
|
552 | + |
|
553 | +* Think of a PostgreSQL schema as a prefix to a table name (public.patients) |
|
554 | +* A database can have tables in different schemas |
|
555 | +* PG will match unprefixed table names according to a search path |
|
556 | +* Why do we like it? |
|
557 | + * Tables in a schema can be dropped together |
|
558 | + * Tables in a schema can be authorized separately |
|
559 | + * Tables can have the same name if in different schemas |
|
560 | + |
|
561 | +### [29] What is a PG tablespace? |
|
562 | + |
|
563 | +* A PG tablespace is a location in the file system where tables (etc.) go |
|
564 | +* Put frequently-queried tables on a fast disk |
|
565 | +* Make it easier for backups |
|
566 | +* NOTE: In Master/Slave, must have the identical tablespace (location) on both machines |
|
567 | + |
|
568 | +### [30] When should I add an index? |
|
569 | + |
|
570 | +* When your query is slow |
|
571 | +* What is slow? |
|
572 | + * Look at the logs and compare to other queries |
|
573 | +* Will an index help me? |
|
574 | + * Try `explain analyze` on your query |
|
575 | + * Add an index |
|
576 | + * Rerun `explain analyze` |
|
577 | +* Don't waste your time on this exercise on your development machine (PG will for instance just use sequential access for small data sets, 'cos it's fast); use something closer to production |
|
578 | + |
|
579 | +### [31] What is a constraint? |
|
580 | + |
|
581 | +* Check constraint: Disallow insertion of a row where a column value violates a rule (e.g., height > 144 inches) |
|
582 | +* Not null constraint: Disallow insertion of a row where a column value is null |
|
583 | +* Foreign key constraint: Disallow insertion of a row with foreign key value not found as a primary key in the other table |
|
584 | + |
|
585 | +### [32] Why don't Rails developers use foreign key constraints? (and other constraints) |
|
586 | + |
|
587 | +* History: Very difficult to create an ORM that can define constraint types in a database-agnostic way |
|
588 | + * Surely you jest? |
|
589 | +* History: Lack of clarity around tier responsibility: E.g., who should validate? |
|
590 | +* History: Inexperience with transactions |
|
591 | +* History: Probably not important in Basecamp (except for billing) |
|
592 | +* History: There was a time when people liked polymorphic associations in ActiveRecord; and how would you constrain such keys? (Triggers?) |
|
593 | +* Lazy |
|
594 | + |
|
595 | +### [33] Why don't *we* use foreign key constraints? (and other constraints) |
|
596 | + |
|
597 | +* Lazy |
|
598 | +* To date, stories have not defined data semantics, only behavioral semantics |
|
599 | +* To date, stories have not defined cost of anomalies |
|
600 | +* schema.rb |
|
601 | + |
|
602 | +### [34] pros and cons of deleting rows from the DB (vs. flagging them as 'deleted') |
|
603 | + |
|
604 | +* Discussion |
|
605 | + |
|
606 | +### [35] pros and cons of free-text fields |
|
607 | + |
|
608 | +* Discussion |
|
609 | + |
|
610 | +### [36] editing data in-place vs. storing new versions with each edit |
|
611 | + |
|
612 | +* Discussion |
|
613 | + |
|
614 | +### [37] Recommendations for Further Reading |
|
615 | + |
|
616 | +1. "SQL Reference" in Enterprise Java in a Nutshell (O'Reilly, 2005) |
|
617 | +2. Molinaro, SQL Cookbook (O'Reilly, 2005) |
|
618 | +3. Rozenshtein, The Essence of SQL (1997) |
|
619 | +4. Celko, SQL for Smarties (2010) (other books can be good; some people like The SQL Puzzle Book) |
|
620 | +5. Karwin, SQL Antipatterns (2010) (considers EAV to be an anti-pattern; discusses rounding errors; problems with grouping; much else) |
|
621 | +6. Venn diagrams: <http://www.codinghorror.com/blog/2007/10/a-visual-explanation-of-sql-joins.html> |
|
622 | + |
|
623 | +### [37] Resources to avoid |
|
624 | + |
|
625 | +0. Elmasri, Fundamentals of Database Systems (6th ed., 2010). Regrettably, this book is a mess and tends to go for the most impenetrable explanation possible. Still, it has stuff on modeling and normalization that is hard to find. |
|
626 | +1. Anything on the web; seriously, watch out. People say really stupid stuff about SQL. |
|
627 | +2. Any book that is focused on one database. You will be unhappy. |
|
628 | + |
|
629 | +--- |
|
630 | + |
|
631 | +### [38] Populating names and books with join table |
|
632 | + |
|
633 | + drop table names; |
|
634 | + create table names ( |
|
635 | + id serial, -- "auto increment" |
|
636 | + first_name text |
|
637 | + ); |
|
638 | + drop table books; |
|
639 | + create table books ( |
|
640 | + id serial, -- "auto increment" |
|
641 | + title text |
|
642 | + ); |
|
643 | + |
|
644 | + delete from names; |
|
645 | + insert into names (first_name) values ('John'); |
|
646 | + insert into names (first_name) values ('James'); |
|
647 | + insert into names (first_name) values ('Tom'); |
|
648 | + insert into names (first_name) values ('Patrick'); |
|
649 | + |
|
650 | + delete from books; |
|
651 | + insert into books (title) values ('The C Programming Language'); |
|
652 | + insert into books (title) values ('The Ruby Programming Language'); |
|
653 | + insert into books (title) values ('Java Concurrency in Practice'); |
|
654 | + insert into books (title) values ('SQL for Smarties'); |
|
655 | + insert into books (title) values ('Specification by Example'); |
|
656 | + |
|
657 | + drop table if exists names_books; |
|
658 | + create table names_books ( |
|
659 | + name_id integer, |
|
660 | + book_id integer |
|
661 | + ); |
|
662 | + |
|
663 | + insert into names_books values ((select id from names where first_name = 'John'), (select id from books where title = 'The C Programming Language')); |
|
664 | + insert into names_books values ((select id from names where first_name = 'John'), (select id from books where title = 'The Ruby Programming Language')); |
|
665 | + insert into names_books values ((select id from names where first_name = 'John'), (select id from books where title = 'SQL for Smarties')); |
|
666 | + insert into names_books values ((select id from names where first_name = 'James'), (select id from books where title = 'Java Concurrency in Practice')); |
|
667 | + insert into names_books values ((select id from names where first_name = 'Tom'), (select id from books where title = 'SQL for Smarties')); |
|
668 | + |
|
669 | + |
|
670 | + select names.first_name, books.title |
|
671 | + from names, books, names_books |
|
672 | + where names.id = names_books.name_id |
|
673 | + and books.id = names_books.book_id; |
|
674 | + |
|
675 | +## Other topics |
|
676 | + |
|
677 | +### [39] The meaning of null |
|
678 | + |
|
679 | +See Elmasri. |
|
680 | + |
|
681 | +### [40] SQL injection |
|
682 | + |
|
683 | +### [41] Prepared statements |
|
684 | + |
|
685 | +### [42] Domain modeling |
|
686 | + |
|
687 | +See Elmasri. |
|
688 | + |
|
689 | +### [43] Normalization |
|
690 | + |
|
691 | +See Elmasri. |
|
692 | + |
|
693 | +### [44] Star schemas |
|
694 | + |
|
695 | +### [45] Data warehousing |
|
696 | + |
|
697 | +### [46] ETL |
|
698 | + |