collectstats.pl 17.8 KB
Newer Older
1
#!/usr/bin/perl -w
2 3
# -*- Mode: perl; indent-tabs-mode: nil -*-
#
4 5 6 7 8 9 10 11 12 13
# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
14
# The Original Code is the Bugzilla Bug Tracking System.
15
#
16
# The Initial Developer of the Original Code is Netscape Communications
17 18 19 20
# Corporation. Portions created by Netscape are
# Copyright (C) 1998 Netscape Communications Corporation. All
# Rights Reserved.
#
21 22
# Contributor(s): Terry Weissman <terry@mozilla.org>,
#                 Harrison Page <harrison@netscape.com>
23 24 25
#                 Gervase Markham <gerv@gerv.net>
#                 Richard Walters <rwalters@qualcomm.com>
#                 Jean-Sebastien Guay <jean_seb@hybride.com>
26
#                 Frédéric Buclin <LpSolit@gmail.com>
27 28

# Run me out of cron at midnight to collect Bugzilla statistics.
29 30 31
#
# To run new charts for a specific date, pass it in on the command line in
# ISO (2004-08-14) format.
32

33
use strict;
34 35 36
use lib qw(. lib);

use List::Util qw(first);
37
use Cwd;
38

39
use Bugzilla;
40
use Bugzilla::Constants;
41
use Bugzilla::Error;
42
use Bugzilla::Util;
43 44 45
use Bugzilla::Search;
use Bugzilla::User;
use Bugzilla::Product;
46
use Bugzilla::Field;
47

48
# Turn off output buffering (probably needed when displaying output feedback
49
# in the regenerate mode).
50 51 52
$| = 1;

# Tidy up after graphing module
53
my $cwd = Cwd::getcwd();
54
if (chdir("graphs")) {
55
    unlink <./*.gif>;
56
    unlink <./*.png>;
57 58
    # chdir("..") doesn't work if graphs is a symlink, see bug 429378
    chdir($cwd);
59
}
60

61
my $dbh = Bugzilla->switch_to_shadow_db();
62

63

64 65 66
# To recreate the daily statistics,  run "collectstats.pl --regenerate" .
my $regenerate = 0;
if ($#ARGV >= 0 && $ARGV[0] eq "--regenerate") {
67
    shift(@ARGV);
68 69 70
    $regenerate = 1;
}

71 72
my $datadir = bz_locations()->{'datadir'};

73
my @myproducts = map {$_->name} Bugzilla::Product->get_all;
74
unshift(@myproducts, "-All-");
75

76 77
# As we can now customize statuses and resolutions, looking at the current list
# of legal values only is not enough as some now removed statuses and resolutions
78
# may have existed in the past, or have been renamed. We want them all.
79 80 81 82 83
my $fields = {};
foreach my $field ('bug_status', 'resolution') {
    my $values = get_legal_field_values($field);
    my $old_values = $dbh->selectcol_arrayref(
                             "SELECT bugs_activity.added
84 85
                                FROM bugs_activity
                          INNER JOIN fielddefs
86
                                  ON fielddefs.id = bugs_activity.fieldid
87 88
                           LEFT JOIN $field
                                  ON $field.value = bugs_activity.added
89
                               WHERE fielddefs.name = ?
90
                                 AND $field.id IS NULL
91 92 93 94 95 96

                               UNION

                              SELECT bugs_activity.removed
                                FROM bugs_activity
                          INNER JOIN fielddefs
97
                                  ON fielddefs.id = bugs_activity.fieldid
98 99
                           LEFT JOIN $field
                                  ON $field.value = bugs_activity.removed
100
                               WHERE fielddefs.name = ?
101 102
                                 AND $field.id IS NULL",
                               undef, ($field, $field));
103

104 105 106 107 108 109
    push(@$values, @$old_values);
    $fields->{$field} = $values;
}

my @statuses = @{$fields->{'bug_status'}};
my @resolutions = @{$fields->{'resolution'}};
110 111 112
# Exclude "" from the resolution list.
@resolutions = grep {$_} @resolutions;

113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
# --regenerate was taking an enormous amount of time to query everything
# per bug, per day. Instead, we now just get all the data out of the DB
# at once and stuff it into some data structures.
my (%bug_status, %bug_resolution, %removed);
if ($regenerate) {
    %bug_resolution = @{ $dbh->selectcol_arrayref(
        'SELECT bug_id, resolution FROM bugs', {Columns=>[1,2]}) };
    %bug_status = @{ $dbh->selectcol_arrayref(
        'SELECT bug_id, bug_status FROM bugs', {Columns=>[1,2]}) };

    my $removed_sth = $dbh->prepare(
        q{SELECT bugs_activity.bug_id, bugs_activity.removed,}
        . $dbh->sql_to_days('bugs_activity.bug_when')
        . q{FROM bugs_activity
           WHERE bugs_activity.fieldid = ?
        ORDER BY bugs_activity.bug_when});

    %removed = (bug_status => {}, resolution => {});
    foreach my $field (qw(bug_status resolution)) {
        my $field_id = Bugzilla::Field->check($field)->id;
        my $rows = $dbh->selectall_arrayref($removed_sth, undef, $field_id);
        my $hash = $removed{$field};
        foreach my $row (@$rows) {
            my ($bug_id, $removed, $when) = @$row;
            $hash->{$bug_id} ||= [];
            push(@{ $hash->{$bug_id} }, { when    => int($when),
                                          removed => $removed });
        }
    }
}

144
my $tstart = time;
145
foreach (@myproducts) {
146
    my $dir = "$datadir/mining";
147

148
    &check_data_dir ($dir);
149

150
    if ($regenerate) {
151
        regenerate_stats($dir, $_, \%bug_resolution, \%bug_status, \%removed);
152 153 154
    } else {
        &collect_stats($dir, $_);
    }
155
}
156
my $tend = time;
157 158
# Uncomment the following line for performance testing.
#print "Total time taken " . delta_time($tstart, $tend) . "\n";
159

160 161
CollectSeriesData();

162 163
sub check_data_dir {
    my $dir = shift;
164

165
    if (! -d $dir) {
166 167
        mkdir $dir, 0755;
        chmod 0755, $dir;
168 169
    }
}
170

171 172 173 174
sub collect_stats {
    my $dir = shift;
    my $product = shift;
    my $when = localtime (time);
175
    my $dbh = Bugzilla->dbh;
176

177 178 179 180 181
    my $product_id;
    if ($product ne '-All-') {
        my $prod = Bugzilla::Product::check_product($product);
        $product_id = $prod->id;
    }
182

183 184 185 186 187
    # NB: Need to mangle the product for the filename, but use the real
    # product name in the query
    my $file_product = $product;
    $file_product =~ s/\//-/gs;
    my $file = join '/', $dir, $file_product;
188 189
    my $exists = -f $file;

190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
    # if the file exists, get the old status and resolution list for that product.
    my @data;
    @data = get_old_data($file) if $exists;

    # If @data is not empty, then we have to recreate the data file.
    if (scalar(@data)) {
        open(DATA, '>', $file)
          || ThrowCodeError('chart_file_open_fail', {'filename' => $file});
    }
    else {
        open(DATA, '>>', $file)
          || ThrowCodeError('chart_file_open_fail', {'filename' => $file});
    }

    # Now collect current data.
    my @row = (today());
    my $status_sql = q{SELECT COUNT(*) FROM bugs WHERE bug_status = ?};
    my $reso_sql   = q{SELECT COUNT(*) FROM bugs WHERE resolution = ?};

    if ($product ne '-All-') {
        $status_sql .= q{ AND product_id = ?};
        $reso_sql   .= q{ AND product_id = ?};
    }

    my $sth_status = $dbh->prepare($status_sql);
    my $sth_reso   = $dbh->prepare($reso_sql);

    my @values ;
    foreach my $status (@statuses) {
        @values = ($status);
        push (@values, $product_id) if ($product ne '-All-');
        my $count = $dbh->selectrow_array($sth_status, undef, @values);
        push(@row, $count);
    }
    foreach my $resolution (@resolutions) {
        @values = ($resolution);
        push (@values, $product_id) if ($product ne '-All-');
        my $count = $dbh->selectrow_array($sth_reso, undef, @values);
        push(@row, $count);
    }
230

231 232 233
    if (!$exists || scalar(@data)) {
        my $fields = join('|', ('DATE', @statuses, @resolutions));
        print DATA <<FIN;
234
# Bugzilla Daily Bug Stats
235
#
236
# Do not edit me! This file is generated.
237
#
238
# fields: $fields
239 240
# Product: $product
# Created: $when
241
FIN
242
    }
243

244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
    # Add existing data, if needed. Note that no count is not treated
    # the same way as a count with 0 bug.
    foreach my $data (@data) {
        print DATA join('|', map {defined $data->{$_} ? $data->{$_} : ''}
                                 ('DATE', @statuses, @resolutions)) . "\n";
    }
    print DATA (join '|', @row) . "\n";
    close DATA;
    chmod 0644, $file;
}

sub get_old_data {
    my $file = shift;

    open(DATA, '<', $file)
      || ThrowCodeError('chart_file_open_fail', {'filename' => $file});

    my @data;
    my @columns;
    my $recreate = 0;
    while (<DATA>) {
        chomp;
        next unless $_;
        if (/^# fields?:\s*(.+)\s*$/) {
            @columns = split(/\|/, $1);
            # Compare this list with @statuses and @resolutions.
            # If they are identical, then we can safely append new data
            # to the end of the file; else we have to recreate it.
            $recreate = 1;
            my @new_cols = ($columns[0], @statuses, @resolutions);
            if (scalar(@columns) == scalar(@new_cols)) {
275 276 277 278 279
                my $identical = 1;
                for (0 .. $#columns) {
                    $identical = 0 if ($columns[$_] ne $new_cols[$_]);
                }
                last if $identical;
280 281 282 283 284 285 286 287 288 289 290 291
            }
        }
        next unless $recreate;
        next if (/^#/); # Ignore comments.
        # If we have to recreate the file, we have to load all existing
        # data first.
        my @line = split /\|/;
        my %data;
        foreach my $column (@columns) {
            $data{$column} = shift @line;
        }
        push(@data, \%data);
292
    }
293 294
    close(DATA);
    return @data;
295 296
}

297 298
# This regenerates all statistics from the database.
sub regenerate_stats {
299
    my ($dir, $product, $bug_resolution, $bug_status, $removed) = @_;
300

301 302
    my $dbh = Bugzilla->dbh;
    my $when = localtime(time());
303 304 305 306 307 308 309 310 311 312
    my $tstart = time();

    # NB: Need to mangle the product for the filename, but use the real
    # product name in the query
    my $file_product = $product;
    $file_product =~ s/\//-/gs;
    my $file = join '/', $dir, $file_product;

    my $and_product = "";
    my $from_product = "";
313 314

    my @values = ();
315
    if ($product ne '-All-') {
316 317
        $and_product = q{ AND products.name = ?};
        $from_product = q{ INNER JOIN products 
318 319 320 321
                          ON bugs.product_id = products.id};
        push (@values, $product);
    }

322 323 324
    # Determine the start date from the date the first bug in the
    # database was created, and the end date from the current day.
    # If there were no bugs in the search, return early.
325
    my $query = q{SELECT } .
326 327
                $dbh->sql_to_days('creation_ts') . q{ AS start_day, } .
                $dbh->sql_to_days('current_date') . q{ AS end_day, } . 
328 329 330 331
                $dbh->sql_to_days("'1970-01-01'") . 
                 qq{ FROM bugs $from_product 
                   WHERE } . $dbh->sql_to_days('creation_ts') . 
                         qq{ IS NOT NULL $and_product 
332
                ORDER BY start_day } . $dbh->sql_limit(1);
333 334
    my ($start, $end, $base) = $dbh->selectrow_array($query, undef, @values);

335 336 337
    if (!defined $start) {
        return;
    }
338

339
    if (open DATA, ">$file") {
340
        my $fields = join('|', ('DATE', @statuses, @resolutions));
341 342 343 344 345
        print DATA <<FIN;
# Bugzilla Daily Bug Stats
#
# Do not edit me! This file is generated.
#
346
# fields: $fields
347 348 349 350 351
# Product: $product
# Created: $when
FIN
        # For each day, generate a line of statistics.
        my $total_days = $end - $start;
352
        my @bugs;
353 354 355 356 357 358 359
        for (my $day = $start + 1; $day <= $end; $day++) {
            # Some output feedback
            my $percent_done = ($day - $start - 1) * 100 / $total_days;
            printf "\rRegenerating $product \[\%.1f\%\%]", $percent_done;

            # Get a list of bugs that were created the previous day, and
            # add those bugs to the list of bugs for this product.
360 361 362 363 364 365 366 367 368 369
            $query = qq{SELECT bug_id 
                          FROM bugs $from_product 
                         WHERE bugs.creation_ts < } . 
                         $dbh->sql_from_days($day - 1) . 
                         q{ AND bugs.creation_ts >= } . 
                         $dbh->sql_from_days($day - 2) . 
                        $and_product . q{ ORDER BY bug_id};

            my $bug_ids = $dbh->selectcol_arrayref($query, undef, @values);
            push(@bugs, @$bug_ids);
370 371

            my %bugcount;
372 373
            foreach (@statuses) { $bugcount{$_} = 0; }
            foreach (@resolutions) { $bugcount{$_} = 0; }
374
            # Get information on bug states and resolutions.
375
            for my $bug (@bugs) {
376 377 378
                my $status = _get_value(
                    $removed->{'bug_status'}->{$bug},
                    $bug_status,  $day, $bug);
379

380 381 382
                if (defined $bugcount{$status}) {
                    $bugcount{$status}++;
                }
383 384 385 386 387

                my $resolution = _get_value(
                    $removed->{'resolution'}->{$bug},
                    $bug_resolution, $day, $bug);

388 389
                if (defined $bugcount{$resolution}) {
                    $bugcount{$resolution}++;
390 391 392 393 394 395 396
                }
            }

            # Generate a line of output containing the date and counts
            # of bugs in each state.
            my $date = sqlday($day, $base);
            print DATA "$date";
397 398
            foreach (@statuses) { print DATA "|$bugcount{$_}"; }
            foreach (@resolutions) { print DATA "|$bugcount{$_}"; }
399 400 401 402 403 404 405 406 407 408 409 410 411
            print DATA "\n";
        }
        
        # Finish up output feedback for this product.
        my $tend = time;
        print "\rRegenerating $product \[100.0\%] - " .
            delta_time($tstart, $tend) . "\n";
            
        close DATA;
        chmod 0640, $file;
    }
}

412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429
# A helper for --regenerate.
# For each bug that exists on a day, we determine its status/resolution
# at the beginning of the day.  If there were no status/resolution
# changes on or after that day, the status was the same as it
# is today (the "current" value).  Otherwise, the status was equal to the
# first "previous value" entry in the bugs_activity table for that 
# bug made on or after that day.
sub _get_value {
    my ($removed, $current, $day, $bug) = @_;

    # Get the first change that's on or after this day.
    my $item = first { $_->{when} >= $day } @{ $removed || [] };

    # If there's no change on or after this day, then we just return the
    # current value.
    return $item ? $item->{removed} : $current->{$bug};
}

430 431 432 433
sub today {
    my ($dom, $mon, $year) = (localtime(time))[3, 4, 5];
    return sprintf "%04d%02d%02d", 1900 + $year, ++$mon, $dom;
}
434

435 436 437 438 439
sub today_dash {
    my ($dom, $mon, $year) = (localtime(time))[3, 4, 5];
    return sprintf "%04d-%02d-%02d", 1900 + $year, ++$mon, $dom;
}

440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
sub sqlday {
    my ($day, $base) = @_;
    $day = ($day - $base) * 86400;
    my ($dom, $mon, $year) = (gmtime($day))[3, 4, 5];
    return sprintf "%04d%02d%02d", 1900 + $year, ++$mon, $dom;
}

sub delta_time {
    my $tstart = shift;
    my $tend = shift;
    my $delta = $tend - $tstart;
    my $hours = int($delta/3600);
    my $minutes = int($delta/60) - ($hours * 60);
    my $seconds = $delta - ($minutes * 60) - ($hours * 3600);
    return sprintf("%02d:%02d:%02d" , $hours, $minutes, $seconds);
}
456 457 458 459 460 461 462 463 464 465

sub CollectSeriesData {
    # We need some way of randomising the distribution of series, such that
    # all of the series which are to be run every 7 days don't run on the same
    # day. This is because this might put the server under severe load if a
    # particular frequency, such as once a week, is very common. We achieve
    # this by only running queries when:
    # (days_since_epoch + series_id) % frequency = 0. So they'll run every
    # <frequency> days, but the start date depends on the series_id.
    my $days_since_epoch = int(time() / (60 * 60 * 24));
466
    my $today = $ARGV[0] || today_dash();
467

468 469
    # We save a copy of the main $dbh and then switch to the shadow and get
    # that one too. Remember, these may be the same.
470 471
    my $dbh = Bugzilla->switch_to_main_db();
    my $shadow_dbh = Bugzilla->switch_to_shadow_db();
472
    
473
    my $serieses = $dbh->selectall_hashref("SELECT series_id, query, creator " .
474 475
                      "FROM series " .
                      "WHERE frequency != 0 AND " . 
476
                      "MOD(($days_since_epoch + series_id), frequency) = 0",
477 478 479 480
                      "series_id");

    # We prepare the insertion into the data table, for efficiency.
    my $sth = $dbh->prepare("INSERT INTO series_data " .
481
                            "(series_id, series_date, series_value) " .
482 483
                            "VALUES (?, " . $dbh->quote($today) . ", ?)");

484 485 486
    # We delete from the table beforehand, to avoid SQL errors if people run
    # collectstats.pl twice on the same day.
    my $deletesth = $dbh->prepare("DELETE FROM series_data 
487
                                   WHERE series_id = ? AND series_date = " .
488 489
                                   $dbh->quote($today));
                                     
490 491 492
    foreach my $series_id (keys %$serieses) {
        # We set up the user for Search.pm's permission checking - each series
        # runs with the permissions of its creator.
493
        my $user = new Bugzilla::User($serieses->{$series_id}->{'creator'});
494
        my $cgi = new Bugzilla::CGI($serieses->{$series_id}->{'query'});
495
        my $data;
496 497 498 499 500

        # Do not die if Search->new() detects invalid data, such as an obsolete
        # login name or a renamed product or component, etc.
        eval {
            my $search = new Bugzilla::Search('params' => $cgi,
501
                                              'fields' => ["bug_id"],
502 503
                                              'user'   => $user);
            my $sql = $search->getSQL();
504 505
            $data = $shadow_dbh->selectall_arrayref($sql);
        };
506

507 508 509 510
        if (!$@) {
            # We need to count the returned rows. Without subselects, we can't
            # do this directly in the SQL for all queries. So we do it by hand.
            my $count = scalar(@$data) || 0;
511

512 513 514
            $deletesth->execute($series_id);
            $sth->execute($series_id, $count);
        }
515 516 517
    }
}