Bug 703788: Improve performance of diff_arrays() with large arrays

r/a=mkanat

Bug 703788: Improve performance of diff_arrays() with large arrays
b4e1dbdc · Frédéric Buclin · 8e99c55e · b4e1dbdc · b4e1dbdc
Commit b4e1dbdc authored Nov 19, 2011 by Frédéric Buclin
Show whitespace changes
Inline Side-by-side

Showing with 41 additions and 25 deletions

Util.pm Bugzilla/Util.pm +26 -23

007util.t t/007util.t +15 -2

No files found.
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@@ -305,36 +305,39 @@ sub use_attachbase {
 sub diff_arrays {
    my ($old_ref, $new_ref, $attrib) = @_;
-    my @old = @$old_ref;
-    my @new = @$new_ref;
    $attrib ||= 'name';
-    # For each pair of (old, new) entries:
+    my (%counts, %pos);
-    # If object arrays were passed then an attribute should be defined;
+    # We are going to alter the old array.
-    # If they're equal, set them to empty. When done, @old contains entries
+    my @old = @$old_ref;
-    # that were removed; @new contains ones that got added.
+    my $i = 0;
-    foreach my $oldv (@old) {
-        foreach my $newv (@new) {
+    # $counts{foo}-- means old, $counts{foo}++ means new.
-            next if ($newv eq '' or $oldv eq '');
+    # If $counts{foo} becomes positive, then we are adding new items,
-            if (blessed($oldv) and blessed($newv)) {
+    # else we simply cancel one old existing item. Remaining items
-                if ($oldv->$attrib eq $newv->$attrib) {
+    # in the old list have been removed.
-                    $newv = $oldv = '';
+    foreach (@old) {
+        next unless defined $_;
+        my $value = blessed($_) ? $_->$attrib : $_;
+        $counts{$value}--;
+        push @{$pos{$value}}, $i++;
    }
+    my @added;
+    foreach (@$new_ref) {
+        next unless defined $_;
+        my $value = blessed($_) ? $_->$attrib : $_;
+        if (++$counts{$value} > 0) {
+            # Ignore empty strings, but objects having an empty string
+            # as attribute are fine.
+            push(@added, $_) unless ($value eq '' && !blessed($_));
        }
        else {
-                if ($oldv eq $newv) {
+            my $old_pos = shift @{$pos{$value}};
-                    $newv = $oldv = ''
+            $old[$old_pos] = undef;
-                }
        }
    }
-    }
+    # Ignore cancelled items as well as empty strings.
+    my @removed = grep { defined $_ && $_ ne '' } @old;
-    my @removed;
-    my @added;
-    @removed = grep { $_ ne '' } @old;
-    @added   = grep { $_ ne '' } @new;
    return (\@removed, \@added);
 }

--- a/t/007util.t
+++ b/t/007util.t
@@ -18,7 +18,7 @@
 # 
 # Contributor(s): Zach Lipton <zach@zachlipton.com>
 #                 Max Kanat-Alexander <mkanat@bugzilla.org>
+#                 Frédéric Buclin <LpSolit@gmail.com>
 #################
 #Bugzilla Test 7#
@@ -26,7 +26,7 @@
 use lib 't';
 use Support::Files;
-use Test::More tests => 13;
+use Test::More tests => 15;
 BEGIN { 
    use_ok(Bugzilla);
@@ -72,3 +72,16 @@ foreach my $input (keys %email_strings) {
    is(Bugzilla::Util::email_filter($input), $email_strings{$input}, 
       "email_filter('$input')");
 }
+# diff_arrays():
+my @old_array = qw(alpha beta alpha gamma gamma beta alpha delta epsilon gamma);
+my @new_array = qw(alpha alpha beta gamma epsilon delta beta delta);
+# The order is not relevant when comparing both arrays for matching items,
+# i.e. (foo bar) and (bar foo) are the same arrays (same items).
+# But when returning data, we try to respect the initial order.
+# We remove the leftmost items first, and return what's left. This means:
+# Removed (in this order): gamma alpha gamma.
+# Added (in this order): delta
+my ($removed, $added) = diff_arrays(\@old_array, \@new_array);
+is_deeply($removed, [qw(gamma alpha gamma)], 'diff_array(\@old, \@new) (check removal)');
+is_deeply($added, [qw(delta)], 'diff_array(\@old, \@new) (check addition)');