Browse Source

Avoid all needless memory copies when computing consensus diffs.

Previously, we operated on smartlists of NUL-terminated strings,
which required us to copy both inputs to produce the NUL-terminated
strings.  Then we copied parts of _those_ inputs to produce an
output smartlist of NUL-terminated strings.  And finally, we
concatenated everything into a final resulting string.

This implementation, instead, uses a pointer-and-extent pattern to
represent each line as a pointer into the original inputs and a
length.  These line objects are then added by reference into the
output. No actual bytes are copied from the original strings until
we finally concatenate the final result together.

Bookkeeping structures and newly allocated strings (like ed
commands) are allocated inside a memarea, to avoid needless mallocs
or complicated should-I-free-this-or-not bookkeeping.

In my measurements, this improves CPU performance by something like
18%.  The memory savings should be much, much higher.
Nick Mathewson 7 years ago
parent
commit
d8c129a11a
4 changed files with 495 additions and 351 deletions
  1. 216 121
      src/or/consdiff.c
  2. 24 6
      src/or/consdiff.h
  3. 22 0
      src/test/bench.c
  4. 233 224
      src/test/test_consdiff.c

+ 216 - 121
src/or/consdiff.c

@@ -28,12 +28,19 @@
  * and will send small couples of slices to calc_changes, keeping the running
  * time near-linear. This is explained in more detail in the gen_ed_diff
  * comments.
+ *
+ * The allocation strategy tries to save time and memory by avoiding needless
+ * copies.  Instead of actually splitting the inputs into separate strings, we
+ * allocate cdline_t objects, each of which represents a line in the original
+ * object or in the output.  We use memarea_t allocators to manage the
+ * temporary memory we use when generating or applying diffs.
  **/
 
 #define CONSDIFF_PRIVATE
 
 #include "or.h"
 #include "consdiff.h"
+#include "memarea.h"
 #include "routerparse.h"
 
 static const char* ns_diff_version = "network-status-diff-version 1";
@@ -41,6 +48,36 @@ static const char* hash_token = "hash";
 
 static char *consensus_join_lines(const smartlist_t *inp);
 
+/** Return true iff a and b have the same contents. */
+STATIC int
+lines_eq(const cdline_t *a, const cdline_t *b)
+{
+  return a->len == b->len && fast_memeq(a->s, b->s, a->len);
+}
+
+/** Return true iff a has the same contents as the nul-terminated string b. */
+STATIC int
+line_str_eq(const cdline_t *a, const char *b)
+{
+  const size_t len = strlen(b);
+  tor_assert(len <= UINT32_MAX);
+  cdline_t bline = { b, (uint32_t)len };
+  return lines_eq(a, &bline);
+}
+
+/** Add a cdline_t to <b>lst</b> holding as its contents the nul-terminated
+ * string s.  Use the provided memory area for storage. */
+STATIC void
+smartlist_add_linecpy(smartlist_t *lst, memarea_t *area, const char *s)
+{
+  size_t len = strlen(s);
+  const char *ss = memarea_memdup(area, s, len);
+  cdline_t *line = memarea_alloc(area, sizeof(cdline_t));
+  line->s = ss;
+  line->len = (uint32_t)len;
+  smartlist_add(lst, line);
+}
+
 /** Compute the digest of <b>cons</b>, and store the result in
  * <b>digest_out</b>. Return 0 on success, -1 on failure. */
 /* This is a separate, mockable function so that we can override it when
@@ -114,7 +151,7 @@ lcs_lengths(const smartlist_slice_t *slice1, const smartlist_slice_t *slice2,
 
   for (int i = 0; i < slice1->len; ++i, si+=direction) {
 
-    const char *line1 = smartlist_get(slice1->list, si);
+    const cdline_t *line1 = smartlist_get(slice1->list, si);
     /* Store the last results. */
     memcpy(prev, result, a_size);
 
@@ -125,8 +162,8 @@ lcs_lengths(const smartlist_slice_t *slice1, const smartlist_slice_t *slice2,
 
     for (int j = 0; j < slice2->len; ++j, sj+=direction) {
 
-      const char *line2 = smartlist_get(slice2->list, sj);
-      if (!strcmp(line1, line2)) {
+      const cdline_t *line2 = smartlist_get(slice2->list, sj);
+      if (lines_eq(line1, line2)) {
         /* If the lines are equal, the lcs is one line longer. */
         result[j + 1] = prev[j] + 1;
       } else {
@@ -146,9 +183,9 @@ STATIC void
 trim_slices(smartlist_slice_t *slice1, smartlist_slice_t *slice2)
 {
   while (slice1->len>0 && slice2->len>0) {
-    const char *line1 = smartlist_get(slice1->list, slice1->offset);
-    const char *line2 = smartlist_get(slice2->list, slice2->offset);
-    if (strcmp(line1, line2)) {
+    const cdline_t *line1 = smartlist_get(slice1->list, slice1->offset);
+    const cdline_t *line2 = smartlist_get(slice2->list, slice2->offset);
+    if (!lines_eq(line1, line2)) {
       break;
     }
     slice1->offset++; slice1->len--;
@@ -159,9 +196,9 @@ trim_slices(smartlist_slice_t *slice1, smartlist_slice_t *slice2)
   int i2 = (slice2->offset+slice2->len)-1;
 
   while (slice1->len>0 && slice2->len>0) {
-    const char *line1 = smartlist_get(slice1->list, i1);
-    const char *line2 = smartlist_get(slice2->list, i2);
-    if (strcmp(line1, line2)) {
+    const cdline_t *line1 = smartlist_get(slice1->list, i1);
+    const cdline_t *line2 = smartlist_get(slice2->list, i2);
+    if (!lines_eq(line1, line2)) {
       break;
     }
     i1--;
@@ -171,15 +208,17 @@ trim_slices(smartlist_slice_t *slice1, smartlist_slice_t *slice2)
   }
 }
 
-/** Like smartlist_string_pos, but limited to the bounds of the slice.
+/** Like smartlist_string_pos, but uses a cdline_t, and is restricted to the
+ * bounds of the slice.
  */
 STATIC int
-smartlist_slice_string_pos(const smartlist_slice_t *slice, const char *string)
+smartlist_slice_string_pos(const smartlist_slice_t *slice,
+                           const cdline_t *string)
 {
   int end = slice->offset + slice->len;
   for (int i = slice->offset; i < end; ++i) {
-    const char *el = smartlist_get(slice->list, i);
-    if (!strcmp(el, string)) {
+    const cdline_t *el = smartlist_get(slice->list, i);
+    if (lines_eq(el, string)) {
       return i;
     }
   }
@@ -199,7 +238,7 @@ set_changed(bitarray_t *changed1, bitarray_t *changed2,
   tor_assert(slice1->len == 0 || slice1->len == 1);
 
   if (slice1->len == 1) {
-    const char *line_common = smartlist_get(slice1->list, slice1->offset);
+    const cdline_t *line_common = smartlist_get(slice1->list, slice1->offset);
     toskip = smartlist_slice_string_pos(slice2, line_common);
     if (toskip == -1) {
       bitarray_set(changed1, slice1->offset);
@@ -318,16 +357,20 @@ static const uint8_t base64_compare_table[256] = {
 
 /** Helper: Get the identity hash from a router line, assuming that the line
  * at least appears to be a router line and thus starts with "r ".
+ *
+ * If an identity hash is found, store it (without decoding it) in
+ * <b>hash_out</b>, and return 0.  On failure, return -1.
  */
-STATIC const char *
-get_id_hash(const char *r_line)
+STATIC int
+get_id_hash(const cdline_t *line, cdline_t *hash_out)
 {
-  r_line += strlen("r ");
+  if (line->len < 2)
+    return -1;
 
   /* Skip the router name. */
-  const char *hash = strchr(r_line, ' ');
+  const char *hash = memchr(line->s + 2, ' ', line->len - 2);
   if (!hash) {
-    return NULL;
+    return -1;
   }
 
   hash++;
@@ -335,28 +378,34 @@ get_id_hash(const char *r_line)
   /* Stop when the first non-base64 character is found. Use unsigned chars to
    * avoid negative indexes causing crashes.
    */
-  while (base64_compare_table[*((unsigned char*)hash_end)] != X) {
+  while (base64_compare_table[*((unsigned char*)hash_end)] != X &&
+         hash_end < line->s + line->len) {
     hash_end++;
   }
 
   /* Empty hash. */
   if (hash_end == hash) {
-    return NULL;
+    return -1;
   }
 
-  return hash;
+  hash_out->s = hash;
+  /* Always true because lines are limited to this length */
+  tor_assert(hash_end - hash <= UINT32_MAX);
+  hash_out->len = (uint32_t)(hash_end - hash);
+
+  return 0;
 }
 
 /** Helper: Check that a line is a valid router entry. We must at least be
  * able to fetch a proper identity hash from it for it to be valid.
  */
 STATIC int
-is_valid_router_entry(const char *line)
+is_valid_router_entry(const cdline_t *line)
 {
-  if (strcmpstart(line, "r ") != 0) {
+  if (line->len < 2 || fast_memneq(line->s, "r ", 2))
     return 0;
-  }
-  return (get_id_hash(line) != NULL);
+  cdline_t tmp;
+  return (get_id_hash(line, &tmp) == 0);
 }
 
 /** Helper: Find the next router line starting at the current position.
@@ -374,7 +423,7 @@ next_router(const smartlist_t *cons, int cur)
     return len;
   }
 
-  const char *line = smartlist_get(cons, cur);
+  const cdline_t *line = smartlist_get(cons, cur);
   while (!is_valid_router_entry(line)) {
     if (++cur >= len) {
       return len;
@@ -384,26 +433,28 @@ next_router(const smartlist_t *cons, int cur)
   return cur;
 }
 
-/** Helper: compare two base64-encoded identity hashes which may be of
+/** Helper: compare two base64-encoded identity hashes, which may be of
  * different lengths. Comparison ends when the first non-base64 char is found.
  */
 STATIC int
-base64cmp(const char *hash1, const char *hash2)
+base64cmp(const cdline_t *hash1, const cdline_t *hash2)
 {
   /* NULL is always lower, useful for last_hash which starts at NULL. */
-  if (!hash1 && !hash2) {
+  if (!hash1->s && !hash2->s) {
     return 0;
   }
-  if (!hash1) {
+  if (!hash1->s) {
     return -1;
   }
-  if (!hash2) {
+  if (!hash2->s) {
     return 1;
   }
 
   /* Don't index with a char; char may be signed. */
-  const unsigned char *a = (unsigned char*)hash1;
-  const unsigned char *b = (unsigned char*)hash2;
+  const unsigned char *a = (unsigned char*)hash1->s;
+  const unsigned char *b = (unsigned char*)hash2->s;
+  const unsigned char *a_end = a + hash1->len;
+  const unsigned char *b_end = b + hash2->len;
   while (1) {
     uint8_t av = base64_compare_table[*a];
     uint8_t bv = base64_compare_table[*b];
@@ -427,6 +478,15 @@ base64cmp(const char *hash1, const char *hash2)
     } else {
       a++;
       b++;
+      if (a == a_end) {
+        if (b == b_end) {
+          return 0;
+        } else {
+          return -1;
+        }
+      } else if (b == b_end) {
+        return 1;
+      }
     }
   }
 }
@@ -436,6 +496,9 @@ base64cmp(const char *hash1, const char *hash2)
  * happen if any lines the script had to add matched "." or if the routers
  * were not properly ordered.
  *
+ * All cdline_t objects in the resulting object are either references to lines
+ * in one of the inputs, or are newly allocated lines in the provided memarea.
+ *
  * This implementation is consensus-specific. To generate an ed diff for any
  * given input in quadratic time, you can replace all the code until the
  * navigation in reverse order with the following:
@@ -449,7 +512,8 @@ base64cmp(const char *hash1, const char *hash2)
  *   calc_changes(cons1_sl, cons2_sl, changed1, changed2);
  */
 STATIC smartlist_t *
-gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
+gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2,
+            memarea_t *area)
 {
   int len1 = smartlist_len(cons1);
   int len2 = smartlist_len(cons2);
@@ -463,12 +527,9 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
   int i1=-1, i2=-1;
   int start1=0, start2=0;
 
-  const char *hash1 = NULL;
-  const char *hash2 = NULL;
-
   /* To check that hashes are ordered properly */
-  const char *last_hash1 = NULL;
-  const char *last_hash2 = NULL;
+  cdline_t hash1 = { NULL, 0 }, hash2 =  { NULL, 0 };
+  cdline_t last_hash1 = { NULL, 0 }, last_hash2 = { NULL, 0 };
 
   /* i1 and i2 are initialized at the first line of each consensus. They never
    * reach past len1 and len2 respectively, since next_router doesn't let that
@@ -485,9 +546,9 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
     if (i1 < len1) {
       i1 = next_router(cons1, i1);
       if (i1 != len1) {
-        last_hash1 = hash1;
-        hash1 = get_id_hash(smartlist_get(cons1, i1));
-        if (base64cmp(hash1, last_hash1) <= 0) {
+        memcpy(&last_hash1, &hash1, sizeof(cdline_t));
+        if (get_id_hash(smartlist_get(cons1, i1), &hash1) < 0 ||
+            base64cmp(&hash1, &last_hash1) <= 0) {
           log_warn(LD_CONSDIFF, "Refusing to generate consensus diff because "
               "the base consensus doesn't have its router entries "
               "sorted properly.");
@@ -499,9 +560,9 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
     if (i2 < len2) {
       i2 = next_router(cons2, i2);
       if (i2 != len2) {
-        last_hash2 = hash2;
-        hash2 = get_id_hash(smartlist_get(cons2, i2));
-        if (base64cmp(hash2, last_hash2) <= 0) {
+        memcpy(&last_hash2, &hash2, sizeof(cdline_t));
+        if (get_id_hash(smartlist_get(cons2, i2), &hash2) < 0 ||
+            base64cmp(&hash2, &last_hash2) <= 0) {
           log_warn(LD_CONSDIFF, "Refusing to generate consensus diff because "
               "the target consensus doesn't have its router entries "
               "sorted properly.");
@@ -525,7 +586,7 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
        * consensus has already reached the end, both are extended to their
        * respecting ends since we are done.
        */
-      int cmp = base64cmp(hash1, hash2);
+      int cmp = base64cmp(&hash1, &hash2);
       while (cmp != 0) {
         if (i1 < len1 && cmp < 0) {
           i1 = next_router(cons1, i1);
@@ -536,9 +597,9 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
             i2 = len2;
             break;
           }
-          last_hash1 = hash1;
-          hash1 = get_id_hash(smartlist_get(cons1, i1));
-          if (base64cmp(hash1, last_hash1) <= 0) {
+          memcpy(&last_hash1, &hash1, sizeof(cdline_t));
+          if (get_id_hash(smartlist_get(cons1, i1), &hash1) < 0 ||
+              base64cmp(&hash1, &last_hash1) <= 0) {
             log_warn(LD_CONSDIFF, "Refusing to generate consensus diff "
                 "because the base consensus doesn't have its router entries "
                 "sorted properly.");
@@ -553,9 +614,9 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
             i1 = len1;
             break;
           }
-          last_hash2 = hash2;
-          hash2 = get_id_hash(smartlist_get(cons2, i2));
-          if (base64cmp(hash2, last_hash2) <= 0) {
+          memcpy(&last_hash2, &hash2, sizeof(cdline_t));
+          if (get_id_hash(smartlist_get(cons2, i2), &hash2) < 0 ||
+              base64cmp(&hash2, &last_hash2) <= 0) {
             log_warn(LD_CONSDIFF, "Refusing to generate consensus diff "
                 "because the target consensus doesn't have its router entries "
                 "sorted properly.");
@@ -566,7 +627,7 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
           i2 = len2;
           break;
         }
-        cmp = base64cmp(hash1, hash2);
+        cmp = base64cmp(&hash1, &hash2);
       }
     }
 
@@ -595,6 +656,7 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
    * each chunk of changes.
    */
   i1=len1-1, i2=len2-1;
+  char buf[128];
   while (i1 > 0 || i2 > 0) {
 
     int start1x, start2x, end1, end2, added, deleted;
@@ -626,31 +688,36 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
 
     if (added == 0) {
       if (deleted == 1) {
-        smartlist_add_asprintf(result, "%id", start1x+1);
+        tor_snprintf(buf, sizeof(buf), "%id", start1x+1);
+        smartlist_add_linecpy(result, area, buf);
       } else {
-        smartlist_add_asprintf(result, "%i,%id", start1x+1, start1x+deleted);
+        tor_snprintf(buf, sizeof(buf), "%i,%id", start1x+1, start1x+deleted);
+        smartlist_add_linecpy(result, area, buf);
       }
 
     } else {
       int i;
       if (deleted == 0) {
-        smartlist_add_asprintf(result, "%ia", start1x);
+        tor_snprintf(buf, sizeof(buf), "%ia", start1x);
+        smartlist_add_linecpy(result, area, buf);
       } else if (deleted == 1) {
-        smartlist_add_asprintf(result, "%ic", start1x+1);
+        tor_snprintf(buf, sizeof(buf), "%ic", start1x+1);
+        smartlist_add_linecpy(result, area, buf);
       } else {
-        smartlist_add_asprintf(result, "%i,%ic", start1x+1, start1x+deleted);
+        tor_snprintf(buf, sizeof(buf), "%i,%ic", start1x+1, start1x+deleted);
+        smartlist_add_linecpy(result, area, buf);
       }
 
       for (i = start2x; i <= end2; ++i) {
-        const char *line = smartlist_get(cons2, i);
-        if (!strcmp(line, ".")) {
+        cdline_t *line = smartlist_get(cons2, i);
+        if (line_str_eq(line, ".")) {
           log_warn(LD_CONSDIFF, "Cannot generate consensus diff because "
               "one of the lines to be added is \".\".");
           goto error_cleanup;
         }
-        smartlist_add(result, tor_strdup(line));
+        smartlist_add(result, line);
       }
-      smartlist_add_asprintf(result, ".");
+      smartlist_add_linecpy(result, area, ".");
     }
   }
 
@@ -664,7 +731,6 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
   bitarray_free(changed1);
   bitarray_free(changed2);
 
-  SMARTLIST_FOREACH(result, char*, line, tor_free(line));
   smartlist_free(result);
 
   return NULL;
@@ -673,6 +739,9 @@ gen_ed_diff(const smartlist_t *cons1, const smartlist_t *cons2)
 /** Apply the ed diff, starting at <b>diff_starting_line</b>, to the consensus
  * and return a new consensus, also as a line-based smartlist. Will return
  * NULL if the ed diff is not properly formatted.
+ *
+ * All cdline_t objects in the resulting object are references to lines
+ * in one of the inputs; nothing is copied.
  */
 STATIC smartlist_t *
 apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
@@ -683,9 +752,17 @@ apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
   smartlist_t *cons2 = smartlist_new();
 
   for (int i=diff_starting_line; i<diff_len; ++i) {
-    const char *diff_line = smartlist_get(diff, i);
+    const cdline_t *diff_cdline = smartlist_get(diff, i);
+    char diff_line[128];
     char *endptr1, *endptr2;
 
+    if (diff_cdline->len > sizeof(diff_line) - 1) {
+      log_warn(LD_CONSDIFF, "Could not apply consensus diff because "
+               "an ed command was far too long");
+      goto error_cleanup;
+    }
+    memcpy(diff_line, diff_cdline->s, diff_cdline->len);
+    diff_line[diff_cdline->len] = 0;
     int start = (int)strtol(diff_line, &endptr1, 10);
     int end;
     if (endptr1 == diff_line) {
@@ -748,8 +825,8 @@ apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
 
     /* Add unchanged lines. */
     for (; j && j > end; --j) {
-      const char *cons_line = smartlist_get(cons1, j-1);
-      smartlist_add(cons2, tor_strdup(cons_line));
+      cdline_t *cons_line = smartlist_get(cons1, j-1);
+      smartlist_add(cons2, cons_line);
     }
 
     /* Ignore removed lines. */
@@ -767,7 +844,7 @@ apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
 
       i++; /* Skip the line with the range and command. */
       while (i < diff_len) {
-        if (!strcmp(smartlist_get(diff, i), ".")) {
+        if (line_str_eq(smartlist_get(diff, i), ".")) {
           break;
         }
         if (++i == diff_len) {
@@ -787,16 +864,16 @@ apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
       }
 
       while (added_i > added_end) {
-        const char *added_line = smartlist_get(diff, added_i--);
-        smartlist_add(cons2, tor_strdup(added_line));
+        cdline_t *added_line = smartlist_get(diff, added_i--);
+        smartlist_add(cons2, added_line);
       }
     }
   }
 
   /* Add remaining unchanged lines. */
   for (; j > 0; --j) {
-    const char *cons_line = smartlist_get(cons1, j-1);
-    smartlist_add(cons2, tor_strdup(cons_line));
+    cdline_t *cons_line = smartlist_get(cons1, j-1);
+    smartlist_add(cons2, cons_line);
   }
 
   /* Reverse the whole thing since we did it from the end. */
@@ -805,7 +882,6 @@ apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
 
   error_cleanup:
 
-  SMARTLIST_FOREACH(cons2, char*, line, tor_free(line));
   smartlist_free(cons2);
 
   return NULL;
@@ -817,11 +893,13 @@ apply_ed_diff(const smartlist_t *cons1, const smartlist_t *diff,
  * up to the caller to free their resources.
  */
 smartlist_t *
-consdiff_gen_diff(const smartlist_t *cons1, const smartlist_t *cons2,
+consdiff_gen_diff(const smartlist_t *cons1,
+                  const smartlist_t *cons2,
                   const consensus_digest_t *digests1,
-                  const consensus_digest_t *digests2)
+                  const consensus_digest_t *digests2,
+                  memarea_t *area)
 {
-  smartlist_t *ed_diff = gen_ed_diff(cons1, cons2);
+  smartlist_t *ed_diff = gen_ed_diff(cons1, cons2, area);
   /* ed diff could not be generated - reason already logged by gen_ed_diff. */
   if (!ed_diff) {
     goto error_cleanup;
@@ -838,8 +916,18 @@ consdiff_gen_diff(const smartlist_t *cons1, const smartlist_t *cons2,
     /* LCOV_EXCL_STOP */
   }
 
-  int cons2_eq = smartlist_strings_eq(cons2, ed_cons2);
-  SMARTLIST_FOREACH(ed_cons2, char*, line, tor_free(line));
+  int cons2_eq = 1;
+  if (smartlist_len(cons2) == smartlist_len(ed_cons2)) {
+    SMARTLIST_FOREACH_BEGIN(cons2, const cdline_t *, line1) {
+      const cdline_t *line2 = smartlist_get(ed_cons2, line1_sl_idx);
+      if (! lines_eq(line1, line2) ) {
+        cons2_eq = 0;
+        break;
+      }
+    } SMARTLIST_FOREACH_END(line1);
+  } else {
+    cons2_eq = 0;
+  }
   smartlist_free(ed_cons2);
   if (!cons2_eq) {
     /* LCOV_EXCL_START -- impossible if diff generation is correct. */
@@ -858,10 +946,13 @@ consdiff_gen_diff(const smartlist_t *cons1, const smartlist_t *cons2,
                 (const char*)digests2->sha3_256, DIGEST256_LEN);
 
   /* Create the resulting consensus diff. */
+  char buf[160];
   smartlist_t *result = smartlist_new();
-  smartlist_add_asprintf(result, "%s", ns_diff_version);
-  smartlist_add_asprintf(result, "%s %s %s", hash_token,
+  tor_snprintf(buf, sizeof(buf), "%s", ns_diff_version);
+  smartlist_add_linecpy(result, area, buf);
+  tor_snprintf(buf, sizeof(buf), "%s %s %s", hash_token,
       cons1_hash_hex, cons2_hash_hex);
+  smartlist_add_linecpy(result, area, buf);
   smartlist_add_all(result, ed_diff);
   smartlist_free(ed_diff);
   return result;
@@ -870,7 +961,6 @@ consdiff_gen_diff(const smartlist_t *cons1, const smartlist_t *cons2,
 
   if (ed_diff) {
     /* LCOV_EXCL_START -- ed_diff is NULL except in unreachable cases above */
-    SMARTLIST_FOREACH(ed_diff, char *, cp, tor_free(cp));
     smartlist_free(ed_diff);
     /* LCOV_EXCL_STOP */
   }
@@ -888,7 +978,7 @@ consdiff_get_digests(const smartlist_t *diff,
                      char *digest2_out)
 {
   smartlist_t *hash_words = NULL;
-  const char *format;
+  const cdline_t *format;
   char cons1_hash[DIGEST256_LEN], cons2_hash[DIGEST256_LEN];
   char *cons1_hash_hex, *cons2_hash_hex;
   if (smartlist_len(diff) < 2) {
@@ -898,14 +988,19 @@ consdiff_get_digests(const smartlist_t *diff,
 
   /* Check that it's the format and version we know. */
   format = smartlist_get(diff, 0);
-  if (strcmp(format, ns_diff_version)) {
+  if (!line_str_eq(format, ns_diff_version)) {
     log_warn(LD_CONSDIFF, "The provided consensus diff format is not known.");
     goto error_cleanup;
   }
 
   /* Grab the base16 digests. */
   hash_words = smartlist_new();
-  smartlist_split_string(hash_words, smartlist_get(diff, 1), " ", 0, 0);
+  {
+    const cdline_t *line2 = smartlist_get(diff, 1);
+    char *h = tor_memdup_nulterm(line2->s, line2->len);
+    smartlist_split_string(hash_words, h, " ", 0, 0);
+    tor_free(h);
+  }
 
   /* There have to be three words, the first of which must be hash_token. */
   if (smartlist_len(hash_words) != 3 ||
@@ -1038,7 +1133,6 @@ consdiff_apply_diff(const smartlist_t *cons1,
 
  done:
   if (cons2) {
-    SMARTLIST_FOREACH(cons2, char *, cp, tor_free(cp));
     smartlist_free(cons2);
   }
 
@@ -1046,34 +1140,40 @@ consdiff_apply_diff(const smartlist_t *cons1,
 }
 
 /**
- * Helper: For every NL-terminated line in <b>s</b>, add that line
- * (without trailing newline) to <b>out</b>.  Return -1 if there are any
- * non-NL terminated lines; 0 otherwise.
- *
- * Modifies <b>s</b> in place: don't do anything with <b>s</b> after you're
- * done here, besides freeing it.
+ * Helper: For every NL-terminated line in <b>s</b>, add a cdline referring to
+ * that line (without trailing newline) to <b>out</b>.  Return -1 if there are
+ * any non-NL terminated lines; 0 otherwise.
  *
  * Unlike tor_split_lines, this function avoids ambiguity on its
  * handling of a final line that isn't NL-terminated.
+ *
+ * All cdline_t objects are allocated in the provided memarea.  Strings
+ * are not copied: if <b>s</b> changes or becomes invalid, then all
+ * generated cdlines will become invalid.
  */
-static int
-consensus_split_lines(smartlist_t *out, char *s)
+STATIC int
+consensus_split_lines(smartlist_t *out, const char *s, memarea_t *area)
 {
-  /* XXXX If we used string slices, we could avoid a bunch of copies here. */
   while (*s) {
-    char *eol = strchr(s, '\n');
+    const char *eol = strchr(s, '\n');
     if (!eol) {
       /* File doesn't end with newline. */
       return -1;
     }
-    *eol = 0;
-    smartlist_add(out, s);
+    if (eol - s > UINT32_MAX) {
+      /* Line is far too long. */
+      return -1;
+    }
+    cdline_t *line = memarea_alloc(area, sizeof(cdline_t));
+    line->s = s;
+    line->len = (uint32_t)(eol - s);
+    smartlist_add(out, line);
     s = eol+1;
   }
   return 0;
 }
 
-/** Given a list of lines, return a newly allocated string containing
+/** Given a list of cdline_t, return a newly allocated string containing
  * all of the lines, terminated with NL, concatenated.
  *
  * Unlike smartlist_join_strings(), avoids lossy operations on empty
@@ -1082,16 +1182,15 @@ static char *
 consensus_join_lines(const smartlist_t *inp)
 {
   size_t n = 0;
-  SMARTLIST_FOREACH(inp, const char *, cp, n += strlen(cp) + 1);
+  SMARTLIST_FOREACH(inp, const cdline_t *, cdline, n += cdline->len + 1);
   n += 1;
   char *result = tor_malloc(n);
   char *out = result;
-  SMARTLIST_FOREACH_BEGIN(inp, const char *, cp) {
-    size_t len = strlen(cp);
-    memcpy(out, cp, len);
-    out += len;
+  SMARTLIST_FOREACH_BEGIN(inp, const cdline_t *, cdline) {
+    memcpy(out, cdline->s, cdline->len);
+    out += cdline->len;
     *out++ = '\n';
-  } SMARTLIST_FOREACH_END(cp);
+  } SMARTLIST_FOREACH_END(cdline);
   *out++ = '\0';
   tor_assert(out == result+n);
   return result;
@@ -1114,28 +1213,26 @@ consensus_diff_generate(const char *cons1,
   if (BUG(r1 < 0 || r2 < 0))
     return NULL; // LCOV_EXCL_LINE
 
-  char *cons1_copy = tor_strdup(cons1);
-  char *cons2_copy = tor_strdup(cons2);
+  memarea_t *area = memarea_new();
   lines1 = smartlist_new();
   lines2 = smartlist_new();
-  if (consensus_split_lines(lines1, cons1_copy) < 0)
+  if (consensus_split_lines(lines1, cons1, area) < 0)
     goto done;
-  if (consensus_split_lines(lines2, cons2_copy) < 0)
+  if (consensus_split_lines(lines2, cons2, area) < 0)
     goto done;
 
-  result_lines = consdiff_gen_diff(lines1, lines2, &d1, &d2);
+  result_lines = consdiff_gen_diff(lines1, lines2, &d1, &d2, area);
 
  done:
-  smartlist_free(lines1);
-  smartlist_free(lines2);
-  tor_free(cons1_copy);
-  tor_free(cons2_copy);
-
   if (result_lines) {
     result = consensus_join_lines(result_lines);
-    SMARTLIST_FOREACH(result_lines, char *, cp, tor_free(cp));
     smartlist_free(result_lines);
   }
+
+  memarea_drop_all(area);
+  smartlist_free(lines1);
+  smartlist_free(lines2);
+
   return result;
 }
 
@@ -1150,18 +1247,17 @@ consensus_diff_apply(const char *consensus,
   smartlist_t *lines1 = NULL, *lines2 = NULL;
   int r1;
   char *result = NULL;
+  memarea_t *area = memarea_new();
 
   r1 = consensus_compute_digest(consensus, &d1);
   if (BUG(r1 < 0))
     return NULL; // LCOV_EXCL_LINE
 
-  char *cons_copy = tor_strdup(consensus);
-  char *diff_copy = tor_strdup(diff);
   lines1 = smartlist_new();
   lines2 = smartlist_new();
-  if (consensus_split_lines(lines1, cons_copy) < 0)
+  if (consensus_split_lines(lines1, consensus, area) < 0)
     goto done;
-  if (consensus_split_lines(lines2, diff_copy) < 0)
+  if (consensus_split_lines(lines2, diff, area) < 0)
     goto done;
 
   result = consdiff_apply_diff(lines1, lines2, &d1);
@@ -1169,8 +1265,7 @@ consensus_diff_apply(const char *consensus,
  done:
   smartlist_free(lines1);
   smartlist_free(lines2);
-  tor_free(cons_copy);
-  tor_free(diff_copy);
+  memarea_drop_all(area);
 
   return result;
 }

+ 24 - 6
src/or/consdiff.h

@@ -13,6 +13,16 @@ char *consensus_diff_apply(const char *consensus,
                            const char *diff);
 
 #ifdef CONSDIFF_PRIVATE
+struct memarea_t;
+
+/** Line type used for constructing consensus diffs.  Each of these lines
+ * refers to a chunk of memory allocated elsewhere, and is not necessarily
+ * NUL-terminated: this helps us avoid copies and save memory. */
+typedef struct cdline_t {
+  const char *s;
+  uint32_t len;
+} cdline_t;
+
 typedef struct consensus_digest_t {
   uint8_t sha3_256[DIGEST256_LEN];
 } consensus_digest_t;
@@ -20,7 +30,8 @@ typedef struct consensus_digest_t {
 STATIC smartlist_t *consdiff_gen_diff(const smartlist_t *cons1,
                                       const smartlist_t *cons2,
                                       const consensus_digest_t *digests1,
-                                      const consensus_digest_t *digests2);
+                                      const consensus_digest_t *digests2,
+                                      struct memarea_t *area);
 STATIC char *consdiff_apply_diff(const smartlist_t *cons1,
                                  const smartlist_t *diff,
                                  const consensus_digest_t *digests1);
@@ -41,7 +52,8 @@ typedef struct smartlist_slice_t {
   int len;
 } smartlist_slice_t;
 STATIC smartlist_t *gen_ed_diff(const smartlist_t *cons1,
-                                const smartlist_t *cons2);
+                                const smartlist_t *cons2,
+                                struct memarea_t *area);
 STATIC smartlist_t *apply_ed_diff(const smartlist_t *cons1,
                                   const smartlist_t *diff,
                                   int start_line);
@@ -54,14 +66,20 @@ STATIC int *lcs_lengths(const smartlist_slice_t *slice1,
                         const smartlist_slice_t *slice2,
                         int direction);
 STATIC void trim_slices(smartlist_slice_t *slice1, smartlist_slice_t *slice2);
-STATIC int base64cmp(const char *hash1, const char *hash2);
-STATIC const char *get_id_hash(const char *r_line);
-STATIC int is_valid_router_entry(const char *line);
+STATIC int base64cmp(const cdline_t *hash1, const cdline_t *hash2);
+STATIC int get_id_hash(const cdline_t *line, cdline_t *hash_out);
+STATIC int is_valid_router_entry(const cdline_t *line);
 STATIC int smartlist_slice_string_pos(const smartlist_slice_t *slice,
-                                      const char *string);
+                                      const cdline_t *string);
 STATIC void set_changed(bitarray_t *changed1, bitarray_t *changed2,
                         const smartlist_slice_t *slice1,
                         const smartlist_slice_t *slice2);
+STATIC int consensus_split_lines(smartlist_t *out, const char *s,
+                                 struct memarea_t *area);
+STATIC void smartlist_add_linecpy(smartlist_t *lst, struct memarea_t *area,
+                                  const char *s);
+STATIC int lines_eq(const cdline_t *a, const cdline_t *b);
+STATIC int line_str_eq(const cdline_t *a, const char *b);
 
 MOCK_DECL(STATIC int,
           consensus_compute_digest,(const char *cons,

+ 22 - 0
src/test/bench.c

@@ -28,6 +28,7 @@ const char tor_git_revision[] = "";
 #include "crypto_curve25519.h"
 #include "onion_ntor.h"
 #include "crypto_ed25519.h"
+#include "consdiff.h"
 
 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_PROCESS_CPUTIME_ID)
 static uint64_t nanostart;
@@ -674,6 +675,27 @@ main(int argc, const char **argv)
 
   tor_threads_init();
 
+  if (argc == 4 && !strcmp(argv[1], "diff")) {
+    init_logging(1);
+    const int N = 200;
+    char *f1 = read_file_to_str(argv[2], RFTS_BIN, NULL);
+    char *f2 = read_file_to_str(argv[3], RFTS_BIN, NULL);
+    if (! f1 || ! f2) {
+      perror("X");
+      return 1;
+    }
+    for (i = 0; i < N; ++i) {
+      char *diff = consensus_diff_generate(f1, f2);
+      tor_free(diff);
+    }
+    char *diff = consensus_diff_generate(f1, f2);
+    printf("%s", diff);
+    tor_free(f1);
+    tor_free(f2);
+    tor_free(diff);
+    return 0;
+  }
+
   for (i = 1; i < argc; ++i) {
     if (!strcmp(argv[i], "--list")) {
       list = 1;

File diff suppressed because it is too large
+ 233 - 224
src/test/test_consdiff.c


Some files were not shown because too many files changed in this diff