Skip to content

Commit

Permalink
regcomp.c: Add shortcuts to some inversion list ops
Browse files Browse the repository at this point in the history
When taking the union or intersection of two inversion lists, not
infrequently the result is the same as one of the inputs.  An example is
the union of a set that contains all code points with any other set.
The result is the set that contains all code points.  Also, not
infrequently, the result is to overwrite one of the inputs.  If the
other input contributed nothing to the result, the whole operation is
effectively a no-op, as the result has the same contents as the input it
overwrites.

It turns out that it is cheap to keep track of if just one input
contributes to the result.  This commit does that, and short-circuits
the rest of the code if it is going to turn out to be a no-op.
  • Loading branch information
khwilliamson committed May 26, 2016
1 parent e67bc19 commit c29dfc6
Showing 1 changed file with 106 additions and 44 deletions.
150 changes: 106 additions & 44 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -8931,13 +8931,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
* length there. The preface says to incorporate its examples into your
* code at your own risk.
*
* The algorithm is like a merge sort.
*
* XXX A potential performance improvement is to keep track as we go along
* if only one of the inputs contributes to the result, meaning the other
* is a subset of that one. In that case, we can skip the final copy and
* return the larger of the input lists, but then outside code might need
* to keep track of whether to free the input list or not */
* The algorithm is like a merge sort. */

const UV* array_a; /* a's array */
const UV* array_b;
Expand All @@ -8952,6 +8946,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
UV i_b = 0;
UV i_u = 0;

bool has_something_from_a = FALSE;
bool has_something_from_b = FALSE;


/* running count, as explained in the algorithm source book; items are
* stopped accumulating and are output when the count changes to/from 0.
* The count is incremented when we start a range that's in the set, and
Expand Down Expand Up @@ -9117,10 +9115,12 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
{
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
cp= array_a[i_a++];
has_something_from_a = TRUE;
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
cp = array_b[i_b++];
has_something_from_b = TRUE;
}

/* Here, have chosen which of the two inputs to look at. Only output
Expand Down Expand Up @@ -9169,10 +9169,54 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
* be output. (If 'count' is non-zero, then the input list we exhausted
* has everything remaining up to the machine's limit in its set, and hence
* in the union, so there will be no further output. */
len_u = i_u;
if (count == 0) {
/* At most one of the subexpressions will be non-zero */
len_u += (len_a - i_a) + (len_b - i_b);
if (count != 0) {

/* Here, there is nothing left to put in the union. If the union came
* only from the input that it is to overwrite, this whole operation is
* a no-op */
if ( UNLIKELY(! has_something_from_b && *output == a)
|| UNLIKELY(! has_something_from_a && *output == b))
{
SvREFCNT_dec_NN(u);
return;
}

len_u = i_u;
}
else {
/* When 'count' is 0, the list that was exhausted (if one was shorter
* than the other) ended with everything above it not in its set. That
* means that the remaining part of the union is precisely the same as
* the non-exhausted list, so can just copy it unchanged. If only one
* of the inputs contributes to the union, and the output is to
* overwite that particular input, then this whole operation was a
* no-op. */

IV copy_count = len_a - i_a;
if (copy_count > 0) {
if (UNLIKELY(! has_something_from_b && *output == a)) {
SvREFCNT_dec_NN(u);
return;
}
Copy(array_a + i_a, array_u + i_u, copy_count, UV);
len_u = i_u + copy_count;
}
else if ((copy_count = len_b - i_b) > 0) {
if (UNLIKELY(! has_something_from_a && *output == b)) {
SvREFCNT_dec_NN(u);
return;
}
Copy(array_b + i_b, array_u + i_u, copy_count, UV);
len_u = i_u + copy_count;
} else if ( UNLIKELY(! has_something_from_b && *output == a)
|| UNLIKELY(! has_something_from_a && *output == b))
{
/* Here, both arrays are exhausted, so no need to do any additional
* copying. Also here, the union came only from the input that it is
* to overwrite, so this whole operation is a no-op */
SvREFCNT_dec_NN(u);
return;
}
}

/* Set the result to the final length, which can change the pointer to
Expand All @@ -9184,22 +9228,6 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
array_u = invlist_array(u);
}

/* When 'count' is 0, the list that was exhausted (if one was shorter than
* the other) ended with everything above it not in its set. That means
* that the remaining part of the union is precisely the same as the
* non-exhausted list, so can just copy it unchanged. (If both lists were
* exhausted at the same time, then the operations below will be both 0.)
*/
if (count == 0) {
IV copy_count; /* At most one will have a non-zero copy count */
if ((copy_count = len_a - i_a) > 0) {
Copy(array_a + i_a, array_u + i_u, copy_count, UV);
}
else if ((copy_count = len_b - i_b) > 0) {
Copy(array_b + i_b, array_u + i_u, copy_count, UV);
}
}

/* If the output is not to overwrite either of the inputs, just return the
* calculated union */
if (a != *output && b != *output) {
Expand Down Expand Up @@ -9272,6 +9300,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
*/
UV count = 0;

bool has_something_from_a = FALSE;
bool has_something_from_b = FALSE;

PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
assert(a != b);

Expand Down Expand Up @@ -9368,10 +9399,12 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
{
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
cp= array_a[i_a++];
has_something_from_a = TRUE;
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
cp= array_b[i_b++];
has_something_from_b = TRUE;
}

/* Here, have chosen which of the two inputs to look at. Only output
Expand Down Expand Up @@ -9413,12 +9446,52 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
count++;
}

/* The final length is what we've output so far plus what else is in the
* intersection. At most one of the subexpressions below will be non-zero
* */
len_r = i_r;
if (count >= 2) {
len_r += (len_a - i_a) + (len_b - i_b);
if (count < 2) {

/* Here, there is nothing left to put in the intersection. If the
* intersection came only from the input that it is to overwrite, this
* whole operation is a no-op */
if ( UNLIKELY(! has_something_from_b && *i == a)
|| UNLIKELY(! has_something_from_a && *i == b))
{
SvREFCNT_dec_NN(r);
return;
}

len_r = i_r;
}
else {
/* When 'count' is 2 or more, the list that was exhausted, what remains
* in the intersection is precisely the same as the non-exhausted list,
* so can just copy it unchanged. If only one of the inputs
* contributes to the intersection, and the output is to overwite that
* particular input, then this whole operation was a no-op. */

IV copy_count = len_a - i_a;
if (copy_count > 0) {
if (UNLIKELY(! has_something_from_b && *i == a)) {
SvREFCNT_dec_NN(r);
return;
}
Copy(array_a + i_a, array_r + i_r, copy_count, UV);
len_r = i_r + copy_count;
}
else if ((copy_count = len_b - i_b) > 0) {
if (UNLIKELY(! has_something_from_a && *i == b)) {
SvREFCNT_dec_NN(r);
return;
}
Copy(array_b + i_b, array_r + i_r, copy_count, UV);
len_r = i_r + copy_count;
} else if ( UNLIKELY(! has_something_from_b && *i == a)
|| UNLIKELY(! has_something_from_a && *i == b))
{
/* Here, both arrays are exhausted, so no need to do any additional
* copying. Also here, the intersection came only from the input
* that it is to overwrite, so this whole operation is a no-op */
SvREFCNT_dec_NN(r);
return;
}
}

/* Set the result to the final length, which can change the pointer to
Expand All @@ -9430,17 +9503,6 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
array_r = invlist_array(r);
}

/* Finish outputting any remaining */
if (count >= 2) { /* At most one will have a non-zero copy count */
IV copy_count;
if ((copy_count = len_a - i_a) > 0) {
Copy(array_a + i_a, array_r + i_r, copy_count, UV);
}
else if ((copy_count = len_b - i_b) > 0) {
Copy(array_b + i_b, array_r + i_r, copy_count, UV);
}
}

/* If the output is not to overwrite either of the inputs, just return the
* calculated intersection */
if (a != *i && b != *i) {
Expand Down

0 comments on commit c29dfc6

Please sign in to comment.