forked from HTTPArchive/legacy.httparchive.org
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdbapi.inc
1213 lines (1059 loc) · 46.8 KB
/
dbapi.inc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
require_once("utils.inc");
require_once("settings.inc");
require_once("crawls.inc");
// default table names
$gPagesTable = "pages";
$gRequestsTable = "requests";
$gStatusTable = "status";
$gUrlsTable = "urls";
$gUrlsChangeTable = "urlschange";
$gStatsTable = "stats";
$gCrawlsTable = "crawls";
$gSettingsTable = "settings"; // for storing misc settings persistently
$gCdfTable = "cdf";
// Desktop tables
$gPagesTableDesktop = $gPagesTable;
$gRequestsTableDesktop = $gRequestsTable;
$gStatusTableDesktop = $gStatusTable;
$gUrlsTableDesktop = "urls";
$gStatsTableDesktop = $gStatsTable;
$gUrlsChangeTableDesktop = $gUrlsChangeTable;
// Mobile tables
$gPagesTableMobile = $gPagesTable . "mobile";
$gRequestsTableMobile = $gRequestsTable . "mobile";
$gUrlsTableMobile = "urlsmobile";
$gStatusTableMobile = $gStatusTable . "mobile";
$gStatsTableMobile = $gStatsTable; // share the data table - a first step toward a single DB
// Chrome tables
$gPagesTableChrome = $gPagesTable . "chrome";
$gRequestsTableChrome = $gRequestsTable . "chrome";
$gUrlsTableChrome = "urls";
$gStatusTableChrome = $gStatusTable . "chrome";
$gStatsTableChrome = $gStatsTable; // share the data table - a first step toward a single DB
// Android tables
$gPagesTableAndroid = $gPagesTable . "android";
$gRequestsTableAndroid = $gRequestsTable . "android";
$gUrlsTableAndroid = "urls";
$gStatusTableAndroid = $gStatusTable . "android";
$gStatsTableAndroid = $gStatsTable; // share the data table - a first step toward a single DB
// Iphone tables
$gPagesTableIphone = $gPagesTable . "iphone";
$gRequestsTableIphone = $gRequestsTable . "iphone";
$gUrlsTableIphone = "urls";
$gStatusTableIphone = $gStatusTable . "iphone";
$gStatsTableIphone = $gStatsTable; // share the data table - a first step toward a single DB
// IE tables
$gPagesTableIe = $gPagesTable . "ie";
$gRequestsTableIe = $gRequestsTable . "ie";
$gUrlsTableIe = "urls";
$gStatusTableIe = $gStatusTable . "ie";
$gStatsTableIe = $gStatsTable; // share the data table - a first step toward a single DB
// Dev tables
$gPagesTableDev = $gPagesTable . "dev";
$gRequestsTableDev = $gRequestsTable . "dev";
$gStatusTableDev = $gStatusTable . "dev";
$gStatsTableDev = $gStatsTable . "dev";
$gUrlsTableDev = $gUrlsTable . "dev";
$gUrlsChangeTableDev = $gUrlsChangeTable; // . "dev";
//
// HERE'S WHERE WE CHANGE THE DEFAULT TABLE NAMES
// DEPENDING ON WHETHER WE'RE DEV OR MOBILE
//
if ( $gbDev ) {
// Use a dev version of the database tables if "dev/" is in the path.
$gPagesTable = $gPagesTableDev;
$gRequestsTable = $gRequestsTableDev;
$gStatusTable = $gStatusTableDev;
$gStatsTable = $gStatsTableDev;
$gUrlsTable = $gUrlsTableDev;
$gUrlsChangeTable = $gUrlsChangeTableDev;
}
else if ( $gbMobile ) {
// Use a mobile version of the database tables if "mobile" is in the path.
$gPagesTable = $gPagesTableMobile;
$gRequestsTable = $gRequestsTableMobile;
$gStatusTable = $gStatusTableMobile;
$gStatsTable = $gStatsTableMobile;
$gUrlsTable = $gUrlsTableMobile;
}
else if ( $gbChrome ) {
// Use a chrome version of the database tables if "chrome" is in the path.
$gPagesTable = $gPagesTableChrome;
$gRequestsTable = $gRequestsTableChrome;
$gStatusTable = $gStatusTableChrome;
$gStatsTable = $gStatsTableChrome;
}
else if ( $gbAndroid ) {
// Use a android version of the database tables if "android" is in the path.
$gPagesTable = $gPagesTableAndroid;
$gRequestsTable = $gRequestsTableAndroid;
$gStatusTable = $gStatusTableAndroid;
$gStatsTable = $gStatsTableAndroid;
}
else if ( $gbIphone ) {
// Use a iphone version of the database tables if "iphone" is in the path.
$gPagesTable = $gPagesTableIphone;
$gRequestsTable = $gRequestsTableIphone;
$gStatusTable = $gStatusTableIphone;
$gStatsTable = $gStatsTableIphone;
}
else if ( $gbIe ) {
// Use a iphone version of the database tables if "iphone" is in the path.
$gPagesTable = $gPagesTableIe;
$gRequestsTable = $gRequestsTableIe;
$gStatusTable = $gStatusTableIe;
$gStatsTable = $gStatsTableIe;
}
// We restrict the range of queries by adding a date range.
// Sometimes the date range is affected by querystring params.
// We also avoid old, undesired data with this.
function dateRange($bDefault = false) {
global $gbMobile, $gbDev;
// Here's the scoop: We use $dateRange as a nice global throughout most of our queries to limit
// the scope of data. BUT...this is a problem when we want to, for example, present the user with
// a list of labels to choose from - we don't want that list of labels to be limited.
// So in some places we use $dateRangeDefault to get the WHOLE range of data.
$dateRangeDefault = ( isPrivateInstance() ? "pageid >= 1" :
( $gbMobile ? "pageid >= 607" :
( $gbDev ? "pageid >= 10281" : "pageid >= 1" ) ) );
if ( $bDefault ) {
return $dateRangeDefault;
}
else {
// Query ALL the date ranges but avoid some undesired data (old, small runs).
$dateRange = $dateRangeDefault;
if ( minLabel() ) {
$dateRange .= " and pageid >= " . labelMinid(minLabel());
}
if ( maxLabel() ) {
$dateRange .= " and pageid <= " . labelMaxid(maxLabel());
}
return $dateRange;
}
}
function dateRangeCrawls($bDefault = false) {
global $gbMobile, $gbDev;
// Here's the scoop: We use $dateRange as a nice global throughout most of our queries to limit
// the scope of data. BUT...this is a problem when we want to, for example, present the user with
// a list of labels to choose from - we don't want that list of labels to be limited.
// So in some places we use $dateRangeDefault to get the WHOLE range of data.
$dateRangeDefault = ( isPrivateInstance() ? "minPageid >= 1" :
( $gbMobile ? "minPageid >= 607" :
( $gbDev ? "minPageid >= 10281" : "minPageid >= 1" ) ) );
if ( $bDefault ) {
return $dateRangeDefault;
}
else {
// Query ALL the date ranges but avoid some undesired data (old, small runs).
$dateRange = $dateRangeDefault;
if ( minLabel() ) {
$dateRange .= " and minPageid >= " . labelMinid(minLabel());
}
if ( maxLabel() ) {
$dateRange .= " and maxPageid <= " . labelMaxid(maxLabel());
}
return $dateRange;
}
}
function dbtable($table, $device="", $label="", $slice="") {
global $gbChrome, $gbAndroid, $gbIe, $gbIphone, $gbMobile;
if ( FALSE !== stripos($device, "android") || $gbMobile ) {
$table .= "mobile";
}
else if ( isDev() ) {
$table .= "dev";
}
else if ( $gbChrome ) {
$table .= "chrome";
}
else if ( $gbAndroid ) {
$table .= "android";
}
else if ( $gbIe ) {
$table .= "ie";
}
else if ( $gbIphone ) {
$table .= "iphone";
}
return $table;
}
// These are interim helper functions for the massive mess of db table global vars
function pagesTable($label="", $slice="", $device="") {
return dbtable("pages", $device, $label, $slice);
}
// Nothing on the production website should need access to the requests table!!
function requestsTable($label="", $slice="", $device="") {
return dbtable("requests", $device, $label, $slice);
}
function statsTable($label="", $slice="", $device="") {
$table = "stats";
if ( isDev() ) {
$table .= "dev";
}
return $table;
}
function crawlsTable() {
return "crawls";
}
// return a where condition to select the appropriate URL slice based on pageid
function sliceCond($label, $slice, $device, $url = "") {
global $gaTop100, $gaTop1000, $gArchive;
$pagesTable = pagesTable($label, $slice, $device);
$dateRange = dateRange();
$crawl = getCrawl($label, $gArchive, $device);
$minid = $crawl['minPageid'];
$maxid = $crawl['maxPageid'];
$sliceCond = "pageid>=$minid and pageid<=$maxid"; // "All"
if ( 0 === strpos($slice, "Top") ) {
// eg, "Top100" or "Top1000"
if ( 1322727953 <= $crawl['startedDateTime'] ) {
// The first crawl with rank available is "Dec 1 2011" (startedDateTime = 1322727953)
$sliceCond .= " and rank > 0 and rank <= " . substr($slice, 3);
}
else {
// Crawls that occurred BEFORE we started saving rank.
if ( "Top100" === $slice || "Top1000" === $slice ) {
// We'll use some old global variables for Top100 and Top1000.
$urls = ( "Top100" === $slice ? $gaTop100 : $gaTop1000 );
$query = "select pageid from $pagesTable where $sliceCond and url in ('" . implode("','", $urls) . "');";
$result = doQuery($query);
$aPageids = array();
while ( $row = mysqli_fetch_assoc($result) ) {
$aPageids[] = $row['pageid'];
}
mysqli_free_result($result);
$sliceCond = "pageid in (" . implode(",", $aPageids) . ")";
}
else {
// If you've reached this block it means we added something higher
// than "Top1000" (like Top10000 or Top100000) AND we're dealing
// with a crawl that does NOT have ranks. Choices include:
// 1. Include all the pages. These crawls have 50K pages or less,
// so it might not be too far off.
// 2. Only include the top 1000 using the global variable. This
// would mean we'd compare 1000 to 10,000 or 100,000 - bad.
// 3. Use the current rankings from the "urls" table. This would
// mean that the set of top URLs would constantly change.
// I'm going with #1 - include all the pages - which means we don't
// have to add anything more to sliceCond.
}
}
}
else if ( "url" === $slice && isset($url) ) {
$sliceCond .= " and url = '$url'";
}
return $sliceCond;
}
//
//
// MYSQL WRAPPER
//
//
// Reuse a single DB connection for the entire session.
$gDBConnection = null;
function doSimpleCommand($cmd, $link = null) {
global $gMysqlDb;
if ( ! $link ) {
$link = getDBConnection();
}
if ( mysqli_select_db($link, $gMysqlDb) ) {
//dprint("doSimpleCommand: $cmd");
$result = mysqli_query($link, $cmd);
//mysqli_close($link); // the findCorrelation code relies on the link not being closed
if ( ! $result ) {
dprint("ERROR in doSimpleCommand: '" . mysqli_error($link) . "' for command: " . $cmd);
}
}
}
function doQuery($query, $link = null) {
global $gMysqlDb;
if ( ! $link ) {
$link = getDBConnection();
}
if ( mysqli_select_db($link, $gMysqlDb) ) {
//dprint("doQuery: $query");
$result = mysqli_query($link, $query);
//mysqli_close($link); // the findCorrelation code relies on the link not being closed
if ( FALSE === $result ) {
dprint("ERROR in doQuery: '" . mysqli_error($link) . "' for query: " . $query);
}
return $result;
}
return null;
}
// return the first row
function doRowQuery($query, $link = null) {
$row = NULL;
$result = doQuery($query, $link);
if ( $result ) {
$row = mysqli_fetch_assoc($result);
mysqli_free_result($result);
}
return $row;
}
// return the first value from the first row
function doSimpleQuery($query, $link = null) {
$value = NULL;
$result = doQuery($query, $link);
if ( $result ) {
$row = mysqli_fetch_assoc($result);
if ( $row ) {
$aKeys = array_keys($row);
$value = $row[$aKeys[0]];
}
mysqli_free_result($result);
}
return $value;
}
// This takes an INSERT command and returns the result of last_insert_id().
// This is only useful if the INSERT command causes an AUTO_INCREMENT column to be updated.
// We have to do this because last_insert_id only works with the same connection.
// TODO - investigate only using one connection
function doLastInsertId($cmd, $link = null) {
global $gMysqlServer, $gMysqlDb, $gMysqlUsername, $gMysqlPassword;
if ( ! $link ) {
$link = getDBConnection();
}
$last_insert_id = null;
if ( mysqli_select_db($link, $gMysqlDb) ) {
//dprint("doSimpleCommand: $cmd");
$result = mysqli_query($link, $cmd);
//mysqli_close($link); // the findCorrelation code relies on the link not being closed
if ( ! $result ) {
dprint("ERROR in doSimpleCommand: '" . mysqli_error($link) . "' for command: " . $cmd);
}
else {
$last_result = mysqli_query($link, "select last_insert_id()");
$row = mysqli_fetch_array($last_result);
$last_insert_id = $row[0];
}
}
return $last_insert_id;
}
// Return a MySQL DB Connection.
// Cache this in a global variable for efficiency.
function getDBConnection() {
global $gDBConnection, $gMysqlServer, $gMysqlUsername, $gMysqlPassword;
// CVSNO - figure out why forking processes screws up this global variable & how to fix (mysql_ing?)
return mysqli_connect($gMysqlServer, $gMysqlUsername, $gMysqlPassword);
if ( ! $gDBConnection ) {
$gDBConnection = mysqli_connect($gMysqlServer, $gMysqlUsername, $gMysqlPassword);
}
return $gDBConnection;
}
function tableExists($tablename) {
return ( $tablename == doSimpleQuery("show tables like '$tablename';") );
}
// scary
function dropTable($tablename) {
doSimpleCommand("drop table $tablename;");
}
function columnExists($column, $table) {
// We reference the "Field" value returned from "show columns".
return doSimpleQuery("show columns from $table where Field = '$column';");
}
/*******************************************************************************
SCHEMA CHANGES:
This is a record of changes to the schema and how the tables were updated
in place.
12/1/10 - Added the "pageid" index to requestsdev.
This made the aggregateStats function 10x faster during import.
mysql> create index pageid on requestsdev (pageid);
7/21/2011 - Added the "rank" column to pages table:
mysql> alter table pagesdev add column rank int(10) unsigned after PageSpeed;
Oct 27, 2011 - Remove "harfile" column and reset unique index. Shorten columns.
mysql> alter table pagesdev modify column archive varchar (16) not null;
mysql> alter table pagesdev modify column label varchar (32) not null;
mysql> drop index startedDateTime on pagesdev;
mysql> alter table pagesdev add unique key (label, urlShort);
mysql> alter table pagesdev drop column harfile;
Nov 3, 2011 - add "rank" column to status table
mysql> drop table statusdev; drop table statusmobile;
Nov 16, 2011 - add perFonts column to stats table
mysql> alter table statsdev add column perFonts int(4) unsigned after perFlash;
April 5, 2012 - add urlhash to urls table for faster lookup
("urlhash" is a substring of the URL's MD5 hash converted to base-10)
mysql> alter table urls add column urlhash int(8) unsigned first;
mysql> update urls set urlhash = conv(substring(md5(urlOrig),1,4),16,10);
mysql> create index urlhash on urls (urlhash);
Oct 26, 2012 - add new stats columns
mysql> alter table statsdev add column renderStart int(10) unsigned after PageSpeed;
mysql> alter table statsdev add column onLoad int(10) unsigned after renderStart;
mysql> alter table statsdev add column perCompressed int(4) unsigned after perHttps;
mysql> alter table statsdev modify perHttps int(4) unsigned;
Dec 2012:
alter table pages drop column title;
[pagesdev, pagesmobile]
alter table pages drop column urlHtmlShort;
[pagesdev, pagesmobile]
alter table requests drop column redirectUrlShort;
[requestsdev, requestsmobile]
alter table pages drop column urlHtml;
[pagesdev, pagesmobile]
alter table pages add column TTFB smallint unsigned AFTER startedDateTime;
[pagesdev, pagesmobile]
alter table pages add column reqGif smallint unsigned not null after reqImg, ADD COLUMN reqJpg smallint unsigned not null after reqGif, ADD COLUMN reqPng smallint unsigned not null after reqJpg, ADD COLUMN reqFont smallint unsigned not null after reqPng;
[pagesdev, pagesmobile]
alter table pages add column bytesGif int(10) unsigned not null after bytesImg, ADD COLUMN bytesJpg int(10) unsigned not null after bytesGif, ADD COLUMN bytesPng int(10) unsigned not null after bytesJpg, ADD COLUMN bytesFont int(10) unsigned not null after bytesPng;
[pagesdev, pagesmobile]
alter table pages add column maxageMore smallint unsigned not null after numDomains, add column maxage365 smallint unsigned not null after numDomains, add column maxage30 smallint unsigned not null after numDomains, add column maxage1 smallint unsigned not null after numDomains, add column maxage0 smallint unsigned not null after numDomains, add column maxageNull smallint unsigned not null after numDomains, add column numDomElements mediumint unsigned not null after numDomains, add column numCompressed smallint unsigned not null after numDomains, add column numHttps smallint unsigned not null after numDomains, add column numGlibs smallint unsigned not null after numDomains, add column numErrors smallint unsigned not null after numDomains, add column numRedirects smallint unsigned not null after numDomains, add column maxDomainReqs smallint unsigned not null after numDomains;
[pagesdev, pagesmobile]
alter table pages add column bytesHtmlDoc mediumint unsigned not null AFTER bytesOther;
[pagesdev, pagesmobile]
alter table pages add column fullyLoaded int(10) unsigned AFTER onLoad;
[pagesdev, pagesmobile]
alter table pages add column cdn varchar (64) AFTER urlShort;
[pagesdev, pagesmobile]
alter table pages add column SpeedIndex mediumint unsigned AFTER PageSpeed;
[pagesdev, pagesmobile]
alter table pages add column visualComplete int(10) unsigned AFTER fullyLoaded;
[pagesdev, pagesmobile]
alter table pages add column gzipTotal int unsigned not null AFTER maxageMore, add column gzipSavings int unsigned not null after gzipTotal;
[pagesdev, pagesmobile]
alter table requests add column expAge int unsigned AFTER respCookieLen;
[requestsdev, requestsmobile]
Jan 7, 2013 - add reqFont, bytesFont columns to stats table
alter table statsdev add column reqFont float unsigned after reqFlash;
alter table statsdev add column bytesFont int(10) unsigned after bytesFlash;
Jan 9 2013 - make expAge not null
alter table requests modify expAge int unsigned not null;
Feb 5 2013 - add new stats to stats table
alter table stats add column bytesHtmlDoc mediumint unsigned after bytesOther, add column perCdn int(4) unsigned after perGlibs, add column gzipSavings int unsigned not null after bytesHtmlDoc, add column gzipTotal int unsigned not null after bytesHtmlDoc, add column numDomElements mediumint unsigned not null after numDomains, add column maxDomainReqs smallint unsigned not null after numDomains, add column fullyLoaded int(10) unsigned after onLoad, add column visualComplete int(10) unsigned after onLoad, add column TTFB int(10) unsigned after PageSpeed, add column SpeedIndex mediumint unsigned after PageSpeed;
[statsdev]
July 2013 -
pages:
add columns: crawlid, urlhash
requests:
crawls:
add columns: notes
stats:
add columns: crawlid
April 20, 2014 - add crawlid index
create index crawlid on pages (crawlid);
May 12, 2014 - add _connections and _cdn_provider column
alter table pages add column _connections int(4) unsigned not null;
alter table requests add column _cdn_provider varchar(64);
alter table stats add column _connections int(4) unsigned not null;
May 13, 2014 - add _adult_site
alter table pages add column _adult_site boolean not null;
May 21, 2014 - add _gzip_save
alter table requests add column _gzip_save int(10) unsigned not null;
May 29, 2014 - add new columns based on custom_metrics feature in WebPagetest
alter table pages add column avg_dom_depth int(4) unsigned not null,
add column document_height int(10) unsigned not null,
add column document_width int(10) unsigned not null,
add column localstorage_size int(10) unsigned not null,
add column sessionstorage_size int(10) unsigned not null,
add column num_iframes int(8) unsigned not null,
add column num_scripts int(8) unsigned not null,
add column doctype varchar(255) not null,
add column meta_viewport varchar(255) not null;
alter table stats add column avg_dom_depth int(4) unsigned not null,
add column document_height int(10) unsigned not null,
add column document_width int(10) unsigned not null,
add column localstorage_size int(10) unsigned not null,
add column sessionstorage_size int(10) unsigned not null,
add column num_iframes int(8) unsigned not null,
add column num_scripts int(8) unsigned not null;
June ?, 2014 - add crawlid to requests
alter table requests add column crawlid int unsigned not null;
//TODO create index crawlid on requests (crawlid);
April 23, 2015 - add ext, type, format to requests, and various req & bytes columns to pages & stats.
alter table requests[] add column type varchar (32);
alter table requests[] add column ext varchar (8);
alter table requests[] add column format varchar (8);
alter table pages[] add column reqAudio int(4) unsigned not null;
alter table pages[] add column reqVideo int(4) unsigned not null;
alter table pages[] add column reqText int(4) unsigned not null;
alter table pages[] add column reqXml int(4) unsigned not null;
alter table pages[] add column reqWebp int(4) unsigned not null;
alter table pages[] add column reqSvg int(4) unsigned not null;
alter table pages[] add column bytesAudio int(10) unsigned not null;
alter table pages[] add column bytesVideo int(10) unsigned not null;
alter table pages[] add column bytesText int(10) unsigned not null;
alter table pages[] add column bytesXml int(10) unsigned not null;
alter table pages[] add column bytesWebp int(10) unsigned not null;
alter table pages[] add column bytesSvg int(10) unsigned not null;
alter table stats[dev] add column reqAudio float unsigned;
alter table stats[dev] add column bytesAudio int(10) unsigned;
alter table stats[dev] add column reqVideo float unsigned;
alter table stats[dev] add column bytesVideo int(10) unsigned;
alter table stats[dev] add column reqText float unsigned;
alter table stats[dev] add column bytesText int(10) unsigned;
alter table stats[dev] add column reqXml float unsigned;
alter table stats[dev] add column bytesXml int(10) unsigned;
alter table stats[dev] add column reqWebp float unsigned;
alter table stats[dev] add column bytesWebp int(10) unsigned;
alter table stats[dev] add column reqSvg float unsigned;
alter table stats[dev] add column bytesSvg int(10) unsigned;
May 29, 2015 - add new columns for custom_metrics to pages[dev, mobile, chrome, android]:
- allow null so we can see if script is failing - not sure if this will cause problems
alter table pages add column num_scripts_async int(8) unsigned,
add column num_scripts_sync int(8) unsigned,
add column usertiming int(8) unsigned;
alter table stats add column num_scripts_async int(8) unsigned not null,
add column num_scripts_sync int(8) unsigned not null,
add column usertiming int(8) unsigned not null;
Dec 9, 2015 - add new columns for SpeedIndex correlations
alter table stats add column SpeedIndexccf1 varchar (32),
add column SpeedIndexccv1 float unsigned,
add column SpeedIndexccf2 varchar (32),
add column SpeedIndexccv2 float unsigned,
add column SpeedIndexccf3 varchar (32),
add column SpeedIndexccv3 float unsigned,
add column SpeedIndexccf4 varchar (32),
add column SpeedIndexccv4 float unsigned,
add column SpeedIndexccf5 varchar (32),
add column SpeedIndexccv5 float unsigned;
*******************************************************************************/
function createTables() {
global $gPagesTable, $gRequestsTable, $gStatusTable, $gStatsTable, $gUrlsTable, $gUrlsChangeTable, $gSettingsTable, $gCdfTable;
if ( ! tableExists($gPagesTable) ) {
$command = "create table $gPagesTable (" .
"pageid int unsigned not null auto_increment" .
", createDate int(10) unsigned not null" .
", archive varchar (16) not null" .
", label varchar (32) not null" . // name of the crawl, eg, "Sep 15 2012"
", crawlid int unsigned not null" . // crawlid joins with "crawls" table
", wptid varchar (64) not null" . // webpagetest.org id
", wptrun int(2) unsigned not null" . // webpagetest.org median #
", url text" . // main URL fetched
", urlShort varchar (255)" . // shortened URL to use in index
", urlhash int(8) unsigned" . // hash for faster searching - use getUrlhashCond()
", cdn varchar (64)" . // if the base page is served by a CDN
", startedDateTime int(10) unsigned" . // epoch time when page was crawled
", TTFB smallint unsigned" . // time to first byte of HTML doc response
", renderStart int(10) unsigned" . // when rendering started
", onContentLoaded int(10) unsigned" . // when content loaded (not available in IE)
", onLoad int(10) unsigned" . // window.onload
", fullyLoaded int(10) unsigned" . // the page is fully done (according to WPT)
", visualComplete int(10) unsigned" . //
", PageSpeed int(4) unsigned" . // PageSpeed score (0-100)
", SpeedIndex mediumint unsigned" . // WPT Speed Index score
", rank int(10) unsigned" . // worldwide rank for this URL (according to Alexa.com)
", reqTotal int(4) unsigned not null" . // # of resource requests, etc.
", reqHtml int(4) unsigned not null" .
", reqJS int(4) unsigned not null" .
", reqCSS int(4) unsigned not null" .
", reqImg int(4) unsigned not null" .
", reqGif smallint unsigned not null" .
", reqJpg smallint unsigned not null" .
", reqPng smallint unsigned not null" .
", reqFont smallint unsigned not null" .
", reqFlash int(4) unsigned not null" .
", reqJson int(4) unsigned not null" .
", reqOther int(4) unsigned not null" .
", bytesTotal int(10) unsigned not null" . // # of bytes TRANSFERRED (so may be bigger when uncompressed)
", bytesHtml int(10) unsigned not null" .
", bytesJS int(10) unsigned not null" .
", bytesCSS int(10) unsigned not null" .
", bytesImg int(10) unsigned not null" .
", bytesGif int(10) unsigned not null" .
", bytesJpg int(10) unsigned not null" .
", bytesPng int(10) unsigned not null" .
", bytesFont int(10) unsigned not null" .
", bytesFlash int(10) unsigned not null" .
", bytesJson int(10) unsigned not null" .
", bytesOther int(10) unsigned not null" .
", bytesHtmlDoc mediumint unsigned not null" . // size of the main HTML document
", numDomains int(4) unsigned" . // # of unique domains across all requests
", maxDomainReqs smallint unsigned not null" . // # of requests on the most-used domain
", numRedirects smallint unsigned not null" .
", numErrors smallint unsigned not null" .
", numGlibs smallint unsigned not null" . // # of requests to Google Libraries API
", numHttps smallint unsigned not null" .
", numCompressed smallint unsigned not null" .
", numDomElements mediumint unsigned not null" .
", maxageNull smallint unsigned not null" . // # of responses without a max-age value in Cache-Control
", maxage0 smallint unsigned not null" . // # of responses with max-age=0
", maxage1 smallint unsigned not null" . // # of responses with 0 < max-age <= 1 day
", maxage30 smallint unsigned not null" . // # of responses with 1 < max-age <= 30 days
", maxage365 smallint unsigned not null" . // # of responses with 30 < max-age <= 365 days
", maxageMore smallint unsigned not null" . // # of responses with 365 < max-age
", gzipTotal int unsigned not null" . // # of bytes xferred for resources that COULD have been gzipped
", gzipSavings int unsigned not null" . // # of bytes that could have been saved if ALL gzippable resources were gzipped
", _connections int(4) unsigned not null" . // # of TCP connections created during page load
", _adult_site boolean not null" .
", avg_dom_depth int(4) unsigned not null" .
", document_height int(10) unsigned not null" .
", document_width int(10) unsigned not null" .
", localstorage_size int(10) unsigned not null" .
", sessionstorage_size int(10) unsigned not null" .
", num_iframes int(8) unsigned not null" .
", num_scripts int(8) unsigned not null" .
", doctype varchar(255) not null" .
", meta_viewport varchar(255) not null" .
", reqAudio int(4) unsigned not null" .
", reqVideo int(4) unsigned not null" .
", reqText int(4) unsigned not null" .
", reqXml int(4) unsigned not null" .
", reqWebp int(4) unsigned not null" .
", reqSvg int(4) unsigned not null" .
", bytesAudio int(10) unsigned not null" .
", bytesVideo int(10) unsigned not null" .
", bytesText int(10) unsigned not null" .
", bytesXml int(10) unsigned not null" .
", bytesWebp int(10) unsigned not null" .
", bytesSvg int(10) unsigned not null" .
", num_scripts_async int(8) unsigned" .
", num_scripts_sync int(8) unsigned" .
", usertiming int(8) unsigned" .
", primary key (pageid)" .
", index(urlhash)" .
", index(crawlid)" .
", unique key (label, urlShort)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
if ( ! tableExists($gRequestsTable) ) {
createRequestsTable($gRequestsTable);
}
// Create Status Table
if ( ! tableExists($gStatusTable) ) {
$command = "create table $gStatusTable (" .
"statusid int unsigned not null auto_increment" .
", crawlid int unsigned not null" . // crawlid joins with "crawls" table
", url text" .
", location varchar (32) not null" .
", archive varchar (32) not null" .
", label varchar (32) not null" .
", rank int(10) unsigned" .
", status varchar (32) not null" .
", attempts int(2) not null" . // how many times we've tried the URL so we can re-submit failures
", timeOfLastChange int(10) unsigned not null" .
", wptid varchar (64)" .
", wptRetCode varchar (8)" .
", medianRun int(4) unsigned" .
", startRender int(10) unsigned" .
", pagespeedScore int(4) unsigned" .
", primary key (statusid)" .
", index(statusid)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Status Table
$crawlsTable = crawlsTable();
if ( ! tableExists( $crawlsTable ) ) {
$command = "create table $crawlsTable (" .
"crawlid int unsigned not null auto_increment" .
", label varchar (32) not null" .
", archive varchar (32) not null" .
", location varchar (32) not null" . // AKA "device" - the WPT location value
", notes varchar(64)" . // notes like "test" or "js off"
", video boolean not null" . // true = video was recorded
", docComplete boolean not null" . // true = stopped at onload; false = ran later
", fvonly boolean not null" . // true = first view only
", runs int(4) unsigned" . // the number of runs for each URL (we use the median)
", startedDateTime int(10) unsigned" .
", finishedDateTime int(10) unsigned" .
", timeOfLastChange int(10) unsigned not null" .
", passes int(2) unsigned" . // the number of passes thru the URLs - we typically do 2 (retry errors)
", minPageid int unsigned not null" .
", maxPageid int unsigned not null" .
", numUrls int unsigned" . // # of URLs submitted
", numErrors int unsigned" . // # of URLs that failed
", numPages int unsigned" . // # of URLs that resulted in valid pages
", numRequests int unsigned" . // # of requests for all the pages
", primary key (crawlid)" .
", index(label, archive, location)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
if ( ! tableExists($gStatsTable) ) {
$command = "create table $gStatsTable (" .
"label varchar (32) not null" . // "Oct 1 2011"
", crawlid int unsigned not null" . // crawlid joins with "crawls" table
", slice varchar (32) not null" . // "Top100", "Top1000", "intersection", or "All"
", device varchar (32) not null" . // "iphone43", "isim", or "IE8"
", version int(6) unsigned not null" . // the SVN version # so we can track what values to recompute
", numurls int unsigned" . // even "Top100" might only be 98 URLs
// averages across ALL SITES in the slice
", PageSpeed float unsigned" .
", SpeedIndex mediumint unsigned" . // WPT Speed Index score
", TTFB int(10) unsigned" .
", renderStart int(10) unsigned" .
", onLoad int(10) unsigned" .
", visualComplete int(10) unsigned" .
", fullyLoaded int(10) unsigned" .
", numDomains float unsigned" .
", maxDomainReqs smallint unsigned not null" .
", numDomElements mediumint unsigned not null" .
", reqTotal float unsigned" .
", reqHtml float unsigned" .
", reqJS float unsigned" .
", reqCSS float unsigned" .
", reqImg float unsigned" .
", reqGif float unsigned" .
", reqJpg float unsigned" .
", reqPng float unsigned" .
", reqFlash float unsigned" .
", reqFont float unsigned" .
", reqJson float unsigned" .
", reqOther float unsigned" .
", bytesTotal int(10) unsigned" .
", bytesHtml int(10) unsigned" .
", bytesJS int(10) unsigned" . // eg, average total bytes of JS downloaded per page
", bytesCSS int(10) unsigned" .
", bytesImg int(10) unsigned" .
", bytesGif int(10) unsigned" .
", bytesJpg int(10) unsigned" .
", bytesPng int(10) unsigned" .
", bytesFlash int(10) unsigned" .
", bytesFont int(10) unsigned" .
", bytesJson int(10) unsigned" .
", bytesOther int(10) unsigned" .
", bytesHtmlDoc mediumint unsigned" . // size of the main HTML document
", gzipTotal int unsigned not null" . // avg # of bytes xferred for resources that COULD have been gzipped
", gzipSavings int unsigned not null" . // avg # of bytes that could have been saved if ALL gzippable resources were gzipped
// % of sites with at least one of these conditions
", perRedirects int(4) unsigned" .
", perErrors int(4) unsigned" .
", perFlash int(4) unsigned" .
", perFonts int(4) unsigned" .
", perGlibs int(4) unsigned" .
", perCdn int(4) unsigned" .
/*
// % of sites that use this JS library
", perjQuery float unsigned" .
", perYUI float unsigned" .
", perDojo float unsigned" .
", perGA float unsigned" .
", perQuantcast float unsigned" .
", perAddThis float unsigned" .
", perFacebook float unsigned" .
", perGPlusOne float unsigned" .
", perTwitter float unsigned" .
", perShareThis float unsigned" .
*/
// % of requests
", perHttps int(4) unsigned" .
", perCompressed int(4) unsigned" .
// % of sites with max-age in this range
// "maxage30" === maxage > 1 day AND maxage <= 30 days
", maxageNull int(4) unsigned" .
", maxage0 int(4) unsigned" .
", maxage1 int(4) unsigned" .
", maxage30 int(4) unsigned" .
", maxage365 int(4) unsigned" .
", maxageMore int(4) unsigned" .
// correlation coefficients: top 5 correlations for onload and render
", onLoadccf1 varchar (32)" .
", onLoadccv1 float unsigned" .
", onLoadccf2 varchar (32)" .
", onLoadccv2 float unsigned" .
", onLoadccf3 varchar (32)" .
", onLoadccv3 float unsigned" .
", onLoadccf4 varchar (32)" .
", onLoadccv4 float unsigned" .
", onLoadccf5 varchar (32)" .
", onLoadccv5 float unsigned" .
", renderStartccf1 varchar (32)" .
", renderStartccv1 float unsigned" .
", renderStartccf2 varchar (32)" .
", renderStartccv2 float unsigned" .
", renderStartccf3 varchar (32)" .
", renderStartccv3 float unsigned" .
", renderStartccf4 varchar (32)" .
", renderStartccv4 float unsigned" .
", renderStartccf5 varchar (32)" .
", renderStartccv5 float unsigned" .
// columns added later
", _connections int(4) unsigned not null" .
", avg_dom_depth int(4) unsigned not null" .
", document_height int(10) unsigned not null" .
", document_width int(10) unsigned not null" .
", localstorage_size int(10) unsigned not null" .
", sessionstorage_size int(10) unsigned not null" .
", num_iframes int(8) unsigned not null" .
", num_scripts int(8) unsigned not null" .
", reqAudio float unsigned" .
", bytesAudio int(10) unsigned" .
", reqVideo float unsigned" .
", bytesVideo int(10) unsigned" .
", reqText float unsigned" .
", bytesText int(10) unsigned" .
", reqXml float unsigned" .
", bytesXml int(10) unsigned" .
", reqWebp float unsigned" .
", bytesWebp int(10) unsigned" .
", reqSvg float unsigned" .
", bytesSvg int(10) unsigned" .
", num_scripts_sync int(8) unsigned not null" .
", num_scripts_async int(8) unsigned not null" .
", usertiming int(8) unsigned not null" .
", SpeedIndexccf1 varchar (32)" .
", SpeedIndexccv1 float unsigned" .
", SpeedIndexccf2 varchar (32)" .
", SpeedIndexccv2 float unsigned" .
", SpeedIndexccf3 varchar (32)" .
", SpeedIndexccv3 float unsigned" .
", SpeedIndexccf4 varchar (32)" .
", SpeedIndexccv4 float unsigned" .
", SpeedIndexccf5 varchar (32)" .
", SpeedIndexccv5 float unsigned" .
", primary key (label, slice, device)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Urls Table
if ( ! tableExists($gUrlsTable) ) {
$command = "create table $gUrlsTable (" .
"timeAdded int(10) unsigned not null" . // epoch time when this URL was added to the list
", urlhash int(8) unsigned" . // hash for faster searching - use getUrlhashCond()
", urlOrig text character set binary" . // use "binary" so it's case *sensitive*
", urlFixed text" .
", rank int(10) unsigned" .
", ranktmp int(10) unsigned" . // use this while we're updating the rank every night
", other boolean not null" . // is this URL from some other source and we should crawl it EVERY time
", optout boolean not null" . // did the website owner ask us NOT to crawl their site
", primary key (urlOrig(255))" .
", index(urlhash)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create CDF table.
if ( ! tableExists($gCdfTable) ) {
$command = "create table $gCdfTable (" .
"crawlid int unsigned not null" . // crawlid joins with "crawls" table
", slice varchar (32) not null" . // "Top100", "Top1000", "intersection", or "All"
", stat varchar (32) not null"; // the stat for this CDF (eg, PageSpeed, or bytesTotal)
// add columns for all percentages 1-100
for ( $i = 1; $i <= 100; $i++ ) {
$command .= ", percentile$i int unsigned not null";
}
$command .= ", primary key (crawlid, slice, stat)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Urls Change Table
// If someone chooses to have their site removed, we queue the request here.
if ( ! tableExists($gUrlsChangeTable) ) {
$command = "create table $gUrlsChangeTable (" .
"url text character set binary" . // use "binary" so it's case *sensitive*
", action varchar(16)" . // "add", "remove" for now
", createDate int(10) unsigned not null" .
", primary key (url(255), action)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Settings Table
if ( ! tableExists($gSettingsTable) ) {
$command = "create table $gSettingsTable (" .
"setting varchar (64)" .
", val varchar (255)" .
", primary key (setting)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
}
function createRequestsTable($tablename, $aMergeTables=null) {
global $ghReqHeaders, $ghRespHeaders;
$sColumns = "";
$aColumns = array_values($ghReqHeaders);
sort($aColumns);
for ( $i = 0; $i < count($aColumns); $i++ ) {
$column = $aColumns[$i];
$sColumns .= ", $column varchar (255)";
}
$aColumns = array_values($ghRespHeaders);
sort($aColumns);
for ( $i = 0; $i < count($aColumns); $i++ ) {
$column = $aColumns[$i];
$sColumns .= ", $column varchar (255)";
}