AFL源码阅读（六）：队列、变异、同步

　　上一篇文章已经解释了 fuzzer 如何与 fork server 通讯以收集程序信息。现在，我们来进入 afl-fuzz.c 的核心流程——那个巨大的 fuzz 循环。

主循环

　　代码如下：

  // fuzz 主循环
  while (1) {

    u8 skipped_fuzz;

    // 精简队列
    cull_queue();

    if (!queue_cur) {
      // 在fuzz循环最开始，或fuzz每完成一轮后，会进入这个分支

      queue_cycle++;
      current_entry     = 0;
      cur_skipped_paths = 0;
      queue_cur         = queue;

      // 在 resume 模式下，恢复先前的位置
      while (seek_to) {
        current_entry++;
        seek_to--;
        queue_cur = queue_cur->next;
      }

      // 刷新 ui
      show_stats();

      if (not_on_tty) {
        ACTF("Entering queue cycle %llu.", queue_cycle);
        fflush(stdout);
      }

      /* If we had a full queue cycle with no new finds, try
         recombination strategies next. */

      if (queued_paths == prev_queued) {
        // 如果整轮没有新发现，则考虑打开 use_splicing 
        if (use_splicing) cycles_wo_finds++; else use_splicing = 1;

      } else cycles_wo_finds = 0;

      prev_queued = queued_paths;

      // 按照文档，设置环境变量 AFL_IMPORT_FIRST，可以让 AFL 在干活之前先与其他 fuzzer 同步一次
      if (sync_id && queue_cycle == 1 && getenv("AFL_IMPORT_FIRST"))
        sync_fuzzers(use_argv);

    }

    // 以这个用例为基础，变异出新的用例去实验。fuzz_one 返回 1 表示这个用例被跳过
    skipped_fuzz = fuzz_one(use_argv);

    // 若这个用例没有被跳过，且处于并行模式
    if (!stop_soon && sync_id && !skipped_fuzz) {
      // 每发生 5 次这样的事件，就执行 sync_fuzzers()
      if (!(sync_interval_cnt++ % SYNC_INTERVAL))
        sync_fuzzers(use_argv);

    }

    if (!stop_soon && exit_1) stop_soon = 2;

    if (stop_soon) break;

    // 轮到下一个输入
    queue_cur = queue_cur->next;
    current_entry++;

  }

　　可以看到，AFL 一次次地遍历 queue，针对 queue 里的每一个元素：

调用 fuzz_one()，以这个用例为基础，变异出许多新的用例，拿去实验
并行模式下，每处理完 5 个用例的变异，便与其他 fuzzer 同步。

　　当处理完队列中的全部用例时，如果整轮都没有新发现，则打开 splicing 变异阶段。另外，维护一个计数器 cycles_wo_finds 表示「连续多少轮没有新发现」。

💡

AFL 没有定期同步机制。AFL++ 文档建议每 60 分钟（甚至更久）同步一次。

　　上面的代码中调用了函数 cull_queue 来重构 favored 集。它的算法在上篇文章已经详细介绍过，我们来看看实现。

cull_queue 函数（精简队列）

/* The second part of the mechanism discussed above is a routine that
   goes over top_rated[] entries, and then sequentially grabs winners for
   previously-unseen bytes (temp_v) and marks them as favored, at least
   until the next run. The favored entries are given more air time during
   all fuzzing steps. */

static void cull_queue(void) {

  struct queue_entry* q;
  static u8 temp_v[MAP_SIZE >> 3];
  u32 i;

  if (dumb_mode || !score_changed) return;

  score_changed = 0;

  // temp_v 是一个 bitmap，表示哪些位置还未被覆盖
  memset(temp_v, 255, MAP_SIZE >> 3);

  queued_favored  = 0;
  pending_favored = 0;

  q = queue;

  // 清空 favored 标记
  while (q) {
    q->favored = 0;
    q = q->next;
  }

  /* Let's see if anything in the bitmap isn't captured in temp_v.
     If yes, and if it has a top_rated[] contender, let's use it. */

  for (i = 0; i < MAP_SIZE; i++)
    if (top_rated[i] && (temp_v[i >> 3] & (1 << (i & 7)))) {
      // i 还没被覆盖，且它有偏爱用例。则把其偏爱用例加入 favored 集

      u32 j = MAP_SIZE >> 3;

      /* Remove all bits belonging to the current entry from temp_v. */

      // 用偏爱用例更新 temp_v
      while (j--) 
        if (top_rated[i]->trace_mini[j])
          temp_v[j] &= ~top_rated[i]->trace_mini[j];

      // 给偏爱用例设置 favored 标记
      top_rated[i]->favored = 1;
      queued_favored++;

      // 若这个偏爱用例还从来没被 fuzz 过，则增加 pending_favored 计数器
      if (!top_rated[i]->was_fuzzed) pending_favored++;

    }

  // 已完成 favored 集的重构
  q = queue;

  // 打个标记，方便一些分析，对 AFL 本身无用
  while (q) {
    mark_as_redundant(q, !q->favored);
    q = q->next;
  }

}

　　这个函数利用了一些位运算优化。它执行的操作与我们前一篇文章描述的算法一致。

　　显然，fuzz 的主要逻辑（变异策略等）位于 fuzz_one 函数中。这个函数长达 1688 行，我们过一会再看。现在，先关注 sync_fuzzers 逻辑，看各个 AFL 实例是如何同步的。

sync_fuzzers 函数

　　顾名思义，这个函数用于在各个 fuzzer 之间同步状态。上一篇文章分析过，若一个 fuzzer 实例处于并行状态，那么用户通过 -o 选项提供给它的工作目录，将被视为 sync_dir；它自己的 out_dir 则是 sync_dir 下以它的 sync_id 命名的子文件夹。另外，master 执行 deterministic 变异，slave 不执行 deterministic 变异。

　　既然 AFL 没有使用 SQLite 等数据库，它只能把文件系统当成数据库用。例如，在工作目录的 queue 子文件夹里保存队列中的用例。可以猜测，fuzzer 的同步过程，大致就是去读取其他实例的文件夹，把里面有用的东西拿过来。看看代码：

/* Grab interesting test cases from other fuzzers. */

static void sync_fuzzers(char** argv) {

  DIR* sd;
  struct dirent* sd_ent;
  u32 sync_cnt = 0;

  // sd 是同步目录
  sd = opendir(sync_dir);
  if (!sd) PFATAL("Unable to open '%s'", sync_dir);

  stage_max = stage_cur = 0;
  cur_depth = 0;

  /* Look at the entries created for every other fuzzer in the sync directory. */

  while ((sd_ent = readdir(sd))) {
    // 枚举 sync_dir 下的每一个目录或文件
    static u8 stage_tmp[128];

    DIR* qd;
    struct dirent* qd_ent;
    u8 *qd_path, *qd_synced_path;
    u32 min_accept = 0, next_min_accept;

    s32 id_fd;

    // 如果一个子文件夹拥有 queue 目录，就认为它是一个 fuzzer 实例的 out_dir

    /* Skip dot files and our own output directory. */
    if (sd_ent->d_name[0] == '.' || !strcmp(sync_id, sd_ent->d_name)) continue;

    /* Skip anything that doesn't have a queue/ subdirectory. */

    qd_path = alloc_printf("%s/%s/queue", sync_dir, sd_ent->d_name);

    if (!(qd = opendir(qd_path))) {
      ck_free(qd_path);
      continue;
    }

    // 现在 qd 是某个其他实例的 out_dir/queue；sd_ent->d_name 是该实例的 sync_id

    /* Retrieve the ID of the last seen test case. */

    // 在自己的 out_dir/.synced 下面，打开一个以其他实例 sync_id 为名的文件
    qd_synced_path = alloc_printf("%s/.synced/%s", out_dir, sd_ent->d_name);

    id_fd = open(qd_synced_path, O_RDWR | O_CREAT, 0600);

    if (id_fd < 0) PFATAL("Unable to create '%s'", qd_synced_path);

    // 若这个文件已经存在，则它的首 4 字节表示 min_accept
    if (read(id_fd, &min_accept, sizeof(u32)) > 0) 
      lseek(id_fd, 0, SEEK_SET);

    next_min_accept = min_accept;

    /* Show stats */    

    sprintf(stage_tmp, "sync %u", ++sync_cnt);
    stage_name = stage_tmp;
    stage_cur  = 0;
    stage_max  = 0;

    /* For every file queued by this fuzzer, parse ID and see if we have looked at
       it before; exec a test case if not. */

    // 打开其他 fuzzer 的 queue 目录，遍历其用例
    while ((qd_ent = readdir(qd))) {

      u8* path;
      s32 fd;
      struct stat st;

      // 跳过 . 开头的文件、跳过 id 小于 min_accept 的用例（id 就在文件名里）
      if (qd_ent->d_name[0] == '.' ||
          sscanf(qd_ent->d_name, CASE_PREFIX "%06u", &syncing_case) != 1 || 
          syncing_case < min_accept) continue;


      // 现在，这个用例我们没见过
      /* OK, sounds like a new one. Let's give it a try. */

      if (syncing_case >= next_min_accept)
        next_min_accept = syncing_case + 1;

      path = alloc_printf("%s/%s", qd_path, qd_ent->d_name);

      /* Allow this to fail in case the other fuzzer is resuming or so... */

      // 打开用例
      fd = open(path, O_RDONLY);

      if (fd < 0) {
         ck_free(path);
         continue;
      }

      if (fstat(fd, &st)) PFATAL("fstat() failed");

      /* Ignore zero-sized or oversized files. */

      if (st.st_size && st.st_size <= MAX_FILE) {

        u8  fault;
        // 利用 mmap 快速读入
        u8* mem = mmap(0, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);

        if (mem == MAP_FAILED) PFATAL("Unable to mmap '%s'", path);

        /* See what happens. We rely on save_if_interesting() to catch major
           errors and save the test case. */

        // 把这用例写进 out_file，并执行一次 run_target
        write_to_testcase(mem, st.st_size);

        // 注意 run_target 只会跑一遍程序并给 shm 分桶，不会把用例加入 queue
        fault = run_target(argv, exec_tmout);

        if (stop_soon) return;

        // 若这个用例有趣，则加入 queue
        syncing_party = sd_ent->d_name;
        queued_imported += save_if_interesting(argv, mem, st.st_size, fault);
        syncing_party = 0;

        munmap(mem, st.st_size);

        if (!(stage_cur++ % stats_update_freq)) show_stats();

      }

      ck_free(path);
      close(fd);

    }

    ck_write(id_fd, &next_min_accept, sizeof(u32), qd_synced_path);

    close(id_fd);
    closedir(qd);
    ck_free(qd_path);
    ck_free(qd_synced_path);
    
  }  

  closedir(sd);

}

　　上述代码细节比较多。对于每一个其他实例，fuzzer 要检查它的 queue 目录，寻找新的用例（这里有一个细节优化：fuzzer 对于每个其他实例，都记得自己看过哪些用例，以前看过的就不再看了）。对于新用例，也不是直接加入 queue，而是调用 run_target() 实际跑一遍这个用例，再调用 save_if_interesting()。我们上一篇文章已经分析过 run_target() 函数，现在看一下 save_if_interesting。

💡

从上面的代码可以看出，AFL 的同步是本 fuzzer 与其他所有 afl-fuzz 实例进行同步。而 AFL++ 出于性能考虑，将策略改为 master 获取所有实例的信息，slave 只获取 master 的信息。

save_if_interesting 函数（保存有趣的用例）

　　这个函数的签名是 static u8 save_if_interesting(char** argv, void* mem, u32 len, u8 fault)，其中 mem 是用例内容，len 是长度； fault 参数则是先前 run_target 函数返回的。

　　我们知道，run_target 函数返回值是：

FAULT_NONE(0)，表示 child 正常结束。
FAULT_TMOUT(1)，表示超时。
FAULT_CRASH(2)，表示程序崩溃。
FAULT_ERROR(3)，表示 fuzzer 本身出现问题。

　　这其中并没有提到是否发现了新的路径。因此， save_if_interesting 必须自行扫描 shm 区域，来判断这个用例是否有趣。现在来看代码：

/* Check if the result of an execve() during routine fuzzing is interesting,
   save or queue the input test case for further analysis if so. Returns 1 if
   entry is saved, 0 otherwise. */

static u8 save_if_interesting(char** argv, void* mem, u32 len, u8 fault) {

  u8  *fn = "";
  u8  hnb;
  s32 fd;
  u8  keeping = 0, res;

  // 若 fault = crash_mode = 2，则处于 crash exploration 模式且崩溃了
  // 若 fault = crash_mode = 0，则处于普通模式，且没有崩溃或超时
  // 因此，这个 if 分支是大部分用例的「正常情况」
  if (fault == crash_mode) {

    /* Keep only if there are new bits in the map, add to queue for
       future fuzzing, etc. */

    // 如果没发现新的路径，就忽略
    // has_new_bits 返回值：0 表示无成果；1 表示 hit count 变动；2 表示发现了新的边
    if (!(hnb = has_new_bits(virgin_bits))) {
      if (crash_mode) total_crashes++;
      return 0;
    }

    // 发现了新的路径，要将其加入 queue 并存到文件中

#ifndef SIMPLE_FILES

    // 这是用例的文件名，描述了 id、来历
    fn = alloc_printf("%s/queue/id:%06u,%s", out_dir, queued_paths,
                      describe_op(hnb));

#else

    fn = alloc_printf("%s/queue/id_%06u", out_dir, queued_paths);

#endif /* ^!SIMPLE_FILES */

    // 入队
    add_to_queue(fn, len, 0);

    // 若覆盖到了新的边
    if (hnb == 2) {
      queue_top->has_new_cov = 1;
      queued_with_cov++;
    }

    // 保存执行路径 hash
    queue_top->exec_cksum = hash32(trace_bits, MAP_SIZE, HASH_CONST);

    /* Try to calibrate inline; this also calls update_bitmap_score() when
       successful. */

    // 校准
    res = calibrate_case(argv, queue_top, mem, queue_cycle - 1, 0);

    if (res == FAULT_ERROR)
      FATAL("Unable to execute target application");

    // 写入文件系统
    fd = open(fn, O_WRONLY | O_CREAT | O_EXCL, 0600);
    if (fd < 0) PFATAL("Unable to create '%s'", fn);
    ck_write(fd, mem, len, fn);
    close(fd);

    keeping = 1;

  }

  // 分类讨论 child 退出原因
  switch (fault) {
  // ...

　　对于正常情况（也就是说，普通模式下 child 正常退出，crash 模式下程序果真 crash），我们考虑是否将其加入 queue。这里调用了 has_new_bits() 判断是否为本质不同的路径，若是，则加入 queue、执行校准、并写进文件系统。

💡

AFL 中维护了三套「还未被探索过的情形」信息，数据类型为 u8[65536]，初始为全 255。分别是：
- virgin_bits，常规 fuzz 过程的探索情况
- virgin_tmout，超时用例的探索情况
- virgin_crash，crash 用例的探索情况
例如，假设 virgin_bits[12345] = 0b01101011，那就说明 id 为 12345 的这条边，在过往的 fuzz 过程中，命中过 4、16、128 这三个桶；其余 5 个桶则未被命中。

函数 has_new_bits(u8* virgin_map) 的功能即是将当前的 trace_bits 与指定的 virgin_map 对照，观察当前路径是否与过往的路径本质不同。由于桶 id 总是 2 的幂次，因此通过简单的位运算，就能看出 trace_bits 是否有覆盖到 ~virgin_map 的部分。

　　形象地说，AFL 中一共有 $65536 \times 8 = 524288$ 个「任务」，每个任务形如「击中 $12345$ 这条边 $32\sim 127$ 次」。每个测试用例在运行一次之后，便可以完成一些任务；若一个用例完成了以前从未被完成过的任务，那它就会被保留。

　　一个人可能会认为，假如现在队列里面有用例 A 和 B，现在变异出一个新用例 C，其运行路径 hash 与 A、B 都不同，那 C 就会被加入队列。但这是错误的——C 必须完成 A 和 B 都未能完成的任务，才可以跻身于队列中。举个例子：$$\begin{aligned}A: \quad & X\to Y \\B: \quad & Y \to Z \\C: \quad & X \to Y \to Z\end{aligned}$$　　这三者的执行路径 hash 都不一样，但由于 A 和 B 已经完成了「$X\to Y$，一次」和「$Y\to Z$，一次」的任务，故 C 没有完成任何新任务，于是 C 不会被加入队列。因此，从数学上说，队列在理论上最多有 524288 个元素。

　　来接着看 save_if_interesting 源码。在考虑完要不要把一个元素加入 queue 后，再考虑要不要将其保存到文件系统。逻辑如下：


  // 分类讨论 child 退出原因
  switch (fault) {

    case FAULT_TMOUT:
      // child 超时，不过 shm 里面仍然有一点信息可以用

      /* Timeouts are not very interesting, but we're still obliged to keep
         a handful of samples. We use the presence of new bits in the
         hang-specific bitmap as a signal of uniqueness. In "dumb" mode, we
         just keep everything. */

      total_tmouts++;

      // 若 unique_hangs 超过 500 个，直接丢弃
      if (unique_hangs >= KEEP_UNIQUE_HANG) return keeping;

      if (!dumb_mode) {

#ifdef WORD_SIZE_64
        // simplify_trace 函数是只保留「是否命中」而不保留 count
        simplify_trace((u64*)trace_bits);
#else
        simplify_trace((u32*)trace_bits);
#endif /* ^WORD_SIZE_64 */

        // 若在 timeout 用例中，没有发现新的边，则丢弃
        if (!has_new_bits(virgin_tmout)) return keeping;

      }

      // 现在，这个 timeout 用例与其他 timeout 用例有本质区别，所以保留

      unique_tmouts++;

      /* Before saving, we make sure that it's a genuine hang by re-running
         the target with a more generous timeout (unless the default timeout
         is already generous). */
      if (exec_tmout < hang_tmout) {
        // 给多点时间，重跑一遍
        u8 new_fault;
        write_to_testcase(mem, len);
        new_fault = run_target(argv, hang_tmout);

        /* A corner case that one user reported bumping into: increasing the
           timeout actually uncovers a crash. Make sure we don't discard it if
           so. */

        // 重跑结果是 crash，报告
        if (!stop_soon && new_fault == FAULT_CRASH) goto keep_as_crash;

        // 重跑正常结束了，丢弃
        if (stop_soon || new_fault != FAULT_TMOUT) return keeping;

      }

#ifndef SIMPLE_FILES
      // 这个 hang 用例要被保留，确定文件名
      fn = alloc_printf("%s/hangs/id:%06llu,%s", out_dir,
                        unique_hangs, describe_op(0));

#else

      fn = alloc_printf("%s/hangs/id_%06llu", out_dir,
                        unique_hangs);

#endif /* ^!SIMPLE_FILES */

      unique_hangs++;

      last_hang_time = get_cur_time();

      break;

    case FAULT_CRASH:

keep_as_crash:

      /* This is handled in a manner roughly similar to timeouts,
         except for slightly different limits and no need to re-run test
         cases. */

      total_crashes++;

      // 如果 crash 用例太多了，则丢弃（默认上限 5000）
      if (unique_crashes >= KEEP_UNIQUE_CRASH) return keeping;

      if (!dumb_mode) {

#ifdef WORD_SIZE_64
        // uint8 count 变 bool
        simplify_trace((u64*)trace_bits);
#else
        simplify_trace((u32*)trace_bits);
#endif /* ^WORD_SIZE_64 */

        // 若这与其他的 crash 本质相同，则丢弃
        if (!has_new_bits(virgin_crash)) return keeping;
      }

      if (!unique_crashes) write_crash_readme();

#ifndef SIMPLE_FILES

      // 这个 crash 要保留
      fn = alloc_printf("%s/crashes/id:%06llu,sig:%02u,%s", out_dir,
                        unique_crashes, kill_signal, describe_op(0));

#else

      fn = alloc_printf("%s/crashes/id_%06llu_%02u", out_dir, unique_crashes,
                        kill_signal);

#endif /* ^!SIMPLE_FILES */

      unique_crashes++;

      last_crash_time = get_cur_time();
      last_crash_execs = total_execs;

      break;

    case FAULT_ERROR: FATAL("Unable to execute target application");

    default: return keeping;

  }

  /* If we're here, we apparently want to save the crash or hang
     test case, too. */

  // 存到文件中
  fd = open(fn, O_WRONLY | O_CREAT | O_EXCL, 0600);
  if (fd < 0) PFATAL("Unable to create '%s'", fn);
  ck_write(fd, mem, len, fn);
  close(fd);

  ck_free(fn);

  return keeping;

}

　　可见，AFL 运行过程中，至多保留 500 个 hang 用例和 5000 个 crash 用例，且如果一个 crash 用例对比起之前的 crash 用例，没有探索到新的边，则它会被丢弃。hang 用例也照此办理，不过它们会被重新跑一次，看看是否仍然超时。

💡

由于 shm 要经历 simplify_trace() 过程，保存一个 hang 或 crash 的条件比正常用例入队更加苛刻——常规用例只要能发现新的 hit count，就能入队；hang 或 crash 则必须发现新的边，才能被保存。

　　以上，我们分析完了 save_if_interesting 函数。它通过一套算法，判断一个用例是否值得保留。如果需要入队，则调用 add_to_queue() 将用例加入 queue。接下来，我们分析 add_to_queue 函数，以了解 queue 的结构。

add_to_queue 函数（将用例加入队列）

　　在看这个函数之前，我们先了解一下队列元素的结构：

struct queue_entry {

  u8* fname;                          /* File name for the test case      */
  u32 len;                            /* Input length                     */

  u8  cal_failed,                     /* Calibration failed?              */
      trim_done,                      /* Trimmed?                         */
      was_fuzzed,                     /* Had any fuzzing done yet?        */
      passed_det,                     /* Deterministic stages passed?     */
      has_new_cov,                    /* Triggers new coverage?           */
      var_behavior,                   /* Variable behavior?               */
      favored,                        /* Currently favored?               */
      fs_redundant;                   /* Marked as redundant in the fs?   */

  u32 bitmap_size,                    /* Number of bits set in bitmap     */
      exec_cksum;                     /* Checksum of the execution trace  */

  u64 exec_us,                        /* Execution time (us)              */
      handicap,                       /* Number of queue cycles behind    */
      depth;                          /* Path depth                       */

  u8* trace_mini;                     /* Trace bytes, if kept             */
  u32 tc_ref;                         /* Trace bytes ref count            */

  struct queue_entry *next,           /* Next element, if any             */
                     *next_100;       /* 100 elements ahead               */

};

　　显然，queue 是一个链表。额外地，有个 next_100 指针，指向其后面第 100 个元素。这是跳表（skiplist）的思路。queue entry 里面有些字段，我们已经见过了：

exec_us 字段，在校准时被设为该用例运行耗时的平均值
favored 、trace_mini 和 tc_ref 与 favored 集相关
exec_cksum 是执行路径 hash

　　其余字段，在阅读 fuzz_one 函数的时候自然会遇到。现在，我们来看 add_to_queue 函数：

/* Append new test case to the queue. */

static void add_to_queue(u8* fname, u32 len, u8 passed_det) {

  struct queue_entry* q = ck_alloc(sizeof(struct queue_entry));

  q->fname        = fname;
  q->len          = len;
  q->depth        = cur_depth + 1;    // cur_depth 在 fuzz_one 阶段被设置为当前被变异用例的深度
  q->passed_det   = passed_det;       // 是否完成了 deterministic 变异

  // 更新 max_depth
  if (q->depth > max_depth) max_depth = q->depth;

  // 追加到队尾
  if (queue_top) {

    queue_top->next = q;
    queue_top = q;

  } else q_prev100 = queue = queue_top = q;

  // 更新计数器
  queued_paths++;
  pending_not_fuzzed++;


  // 由于现在要把一个用例加入队列，那说明这个用例肯定是有趣的，所以这一轮发现了新东西
  cycles_wo_finds = 0;

  // 维护 next_100 指针。q_prev100 是全局变量
  /* Set next_100 pointer for every 100th element (index 0, 100, etc) to allow faster iteration. */
  if ((queued_paths - 1) % 100 == 0 && queued_paths > 1) {

    q_prev100->next_100 = q;
    q_prev100 = q;

  }

  last_path_time = get_cur_time();

}

　　可见逻辑比较简单，就是往队列尾部追加一个结点。需要注意的是，仅 0、100、200……这些结点记录了 next_100 指针，所以假如我们要从 114 跳转到 514，必须先一步步走到 200，再跳转到 300、400、500，再一步步走到 514。

　　上面的代码里面提到了 depth 字段和 passed_det 字段。由于 AFL 是变异式 fuzzer，每个用例肯定是从某个已有用例的基础上变异来的。因此，假如初始语料集的深度为 1，那么从初始语料集直接变异出的用例，其深度就为 2，以此类推。至于 passed_det 字段，它用来表示一个用例是否完成过 deterministic 变异。

💡

deterministic 变异阶段，是对用例执行一套「确定性」的变异策略，例如逐 bit 翻转。
由于 deterministic 变异很慢（实验次数与用例长度成正比），且对一个用例多次执行 deterministic 变异阶段，显然不可能有新的发现，因此 AFL 做了一个优化：假如一个用例已经完成过 deterministic 变异，就不会再做了。

　　现在，我们已经了解 corpus 是如何被存储在队列中的，也知道了一个用例如何被判定为「有趣」、如何在各个 fuzzer 之间同步。接下来，只要源源不断地产生新用例，交由 run_target 函数执行实验，然后由 save_if_interesting 判断是否有趣并保存，一个完整的 fuzzer 便诞生了。在主函数中我们已经看到，对于队列中的每一个元素，都将会调用 fuzz_one() 尝试在其基础上变异。接下来，我们阅读 fuzz_one 函数。

fuzz_one 函数（从一个用例变异出许多用例）

　　这是一个非常长的函数，我们应当先搞清它的结构。粗看一眼：

/* Take the current entry from the queue, fuzz it for a while. This
   function is a tad too long... returns 0 if fuzzed successfully, 1 if
   skipped or bailed out. */

static u8 fuzz_one(char** argv) {
  // 决定是否直接跳过这个用例

  // 准备用例文件
  // 校准（若有必要）
  // 用例裁剪
  // 计算 perf_score
  // 决定是否要跳过 deterministic 阶段

  // 下面开始 deterministic 阶段
  // ..................
  // ..................

  // 下面开始 havoc 阶段
  // ..................
  // ..................

  // 下面开始 splicing 阶段
  // ..................
  // ..................

  // 打扫现场
}

　　显然，这个函数首先进行一些准备工作（例如把用例内容 mmap 到内存中），然后分别执行 deterministic、havoc、splicing 三个阶段，最后做一点清扫。这样划分之后，我们来依次阅读各个部分。

fuzz_one - 准备工作

　　拿到一个用例后，首先决定是否直接跳过它。代码如下：

static u8 fuzz_one(char** argv) {

  s32 len, fd, temp_len, i, j;
  u8  *in_buf, *out_buf, *orig_in, *ex_tmp, *eff_map = 0;
  u64 havoc_queued,  orig_hit_cnt, new_hit_cnt;
  u32 splice_cycle = 0, perf_score = 100, orig_perf, prev_cksum, eff_cnt = 1;

  u8  ret_val = 1, doing_det = 0;

  u8  a_collect[MAX_AUTO_EXTRA];
  u32 a_len = 0;

  if (pending_favored) {
    // 若有暂未被 fuzz 过的 favored 用例

    /* If we have any favored, non-fuzzed new arrivals in the queue,
       possibly skip to them at the expense of already-fuzzed or non-favored
       cases. */
    // 若当前用例已经被 fuzz 过了，或当前用例并非 favored，则以 99% 的概率跳过，让新 favored 用例先 fuzz
    if ((queue_cur->was_fuzzed || !queue_cur->favored) &&
        UR(100) < SKIP_TO_NEW_PROB) return 1;

  } else if (!dumb_mode && !queue_cur->favored && queued_paths > 10) {
    // 所有 favored 用例都被 fuzz 过，且当前用例并非 favored，且 corpus 大小超过 10

    /* Otherwise, still possibly skip non-favored cases, albeit less often.
       The odds of skipping stuff are higher for already-fuzzed inputs and
       lower for never-fuzzed entries. */
    if (queue_cycle > 1 && !queue_cur->was_fuzzed) {
      // 若当前用例没有被 fuzz 过，以 75% 的概率跳过
      if (UR(100) < SKIP_NFAV_NEW_PROB) return 1;
    } else {
      // 若当前用例被 fuzz 过，则以 95% 的概率跳过
      if (UR(100) < SKIP_NFAV_OLD_PROB) return 1;
    }
  }

  if (not_on_tty) {
    ACTF("Fuzzing test case #%u (%u total, %llu uniq crashes found)...",
         current_entry, queued_paths, unique_crashes);
    fflush(stdout);
  }

　　上述代码的功能是概率性地跳过 non-favored 用例。白皮书中提到，当考虑 fuzz 一个 non-favored 用例时：

若队列中存在一个从来没被 fuzz 过的 favored 用例，则以 99% 概率跳过当前用例（要尽快去 fuzz 全新 favored 用例）
若没有全新的 favored 用例，且当前用例已经被 fuzz 过，则以 95% 概率跳过
若没有全新的 favored 用例，而当前用例没被 fuzz 过，则以 75% 概率跳过

　　于是，在 fuzz 运行后期，一个 non-favored 用例被跳过的几率高达 95%。这节省下来的时间，投入到 favored 用例的 fuzz 去了。这究竟是否合理，有待商榷（事实上 favored 集的选取过程也不是无懈可击）——存在很多论文，通过改进 AFL 对各个种子的资源分配，提升了挖漏洞的效率。AFLFast 就是其代表。

  /* Map the test case into memory. */
  // 要 fuzz 的用例文件
  fd = open(queue_cur->fname, O_RDONLY);

  if (fd < 0) PFATAL("Unable to open '%s'", queue_cur->fname);

  len = queue_cur->len;

  // 调用 mmap 把文件挂进地址空间
  orig_in = in_buf = mmap(0, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

  if (orig_in == MAP_FAILED) PFATAL("Unable to mmap '%s'", queue_cur->fname);

  close(fd);

  /* We could mmap() out_buf as MAP_PRIVATE, but we end up clobbering every
     single byte anyway, so it wouldn't give us any performance or memory usage
     benefits. */

  // out_buf 用来存储变异出来的用例，要交由目标程序执行
  out_buf = ck_alloc_nozero(len);

  subseq_tmouts = 0;

  // 设置全局变量 cur_depth 为本用例的深度
  cur_depth = queue_cur->depth;

　　这段代码把用例文件 mmap 进地址空间，并给 out_buf 分配内存。这个 out_buf 用于存储变异出来的用例。

/*******************************************
   * CALIBRATION (only if failed earlier on) *
   *******************************************/

  // 若本用例上次校准失败，则重新校准一次
  if (queue_cur->cal_failed) {

    u8 res = FAULT_TMOUT;

    // 最多校准一个用例 3 次
    if (queue_cur->cal_failed < CAL_CHANCES) {

      /* Reset exec_cksum to tell calibrate_case to re-execute the testcase
         avoiding the usage of an invalid trace_bits.
         For more info: https://github.com/AFLplusplus/AFLplusplus/pull/425 */

      queue_cur->exec_cksum = 0;

      res = calibrate_case(argv, queue_cur, in_buf, queue_cycle - 1, 0);

      if (res == FAULT_ERROR)
        FATAL("Unable to execute target application");

    }

    // 若行为异常（理应正常运行的用例 crash 掉了），则跳过这个用例
    if (stop_soon || res != crash_mode) {
      cur_skipped_paths++;
      goto abandon_entry;
    }

  }

　　这是对当前用例进行校准。我们提到过，进了 queue 的用例都需要校准，无论是初始 corpus 还是 fuzz 过程中发现的有趣用例。但校准不一定成功，所以在 fuzz 这个用例的时候，若它没有被校准成功过，则尝试校准。

  /************
   * TRIMMING *
   ************/

  // 如果当前用例没有被裁剪过，则进行裁剪
  if (!dumb_mode && !queue_cur->trim_done) {

    u8 res = trim_case(argv, queue_cur, in_buf);

    if (res == FAULT_ERROR)
      FATAL("Unable to execute target application");

    if (stop_soon) {
      cur_skipped_paths++;
      goto abandon_entry;
    }

    /* Don't retry trimming, even if it failed. */

    queue_cur->trim_done = 1;

    // 更新裁剪后的用例长度
    if (len != queue_cur->len) len = queue_cur->len;

  }
  
  memcpy(out_buf, in_buf, len);

　　这部分逻辑是用例裁剪。我们曾经分析过 afl-tmin，那是一个比较细致的裁剪器。而 afl-fuzz 中的裁剪算法如下：

/* Trim all new test cases to save cycles when doing deterministic checks. The
   trimmer uses power-of-two increments somewhere between 1/16 and 1/1024 of
   file size, to keep the stage short and sweet. */

static u8 trim_case(char** argv, struct queue_entry* q, u8* in_buf) {

  static u8 tmp[64];
  static u8 clean_trace[MAP_SIZE];

  u8  needs_write = 0, fault = 0;
  u32 trim_exec = 0;
  u32 remove_len;
  u32 len_p2;

  /* Although the trimmer will be less useful when variable behavior is
     detected, it will still work to some extent, so we don't check for
     this. */
  
  // 不裁剪长度小于 5 的用例
  if (q->len < 5) return 0;

  // 更新全局变量
  stage_name = tmp;
  bytes_trim_in += q->len;

  /* Select initial chunk len, starting with large steps. */

  // >= len 的 2 的幂次
  len_p2 = next_p2(q->len);

  // 把输入分成 16 块。最小块长度是 4
  remove_len = MAX(len_p2 / TRIM_START_STEPS, TRIM_MIN_BYTES);

  /* Continue until the number of steps gets too high or the stepover
     gets too small. */

  // 尝试以 remove_len 为块长进行裁剪
  while (remove_len >= MAX(len_p2 / TRIM_END_STEPS, TRIM_MIN_BYTES)) {

    u32 remove_pos = remove_len;

    sprintf(tmp, "trim %s/%s", DI(remove_len), DI(remove_len));

    stage_cur = 0;
    stage_max = q->len / remove_len;

    while (remove_pos < q->len) {

      u32 trim_avail = MIN(remove_len, q->len - remove_pos);
      u32 cksum;

      // 裁掉这块，写入 out_file
      write_with_gap(in_buf, q->len, remove_pos, trim_avail);

      // 试运行
      fault = run_target(argv, exec_tmout);
      trim_execs++;

      if (stop_soon || fault == FAULT_ERROR) goto abort_trimming;

      /* Note that we don't keep track of crashes or hangs here; maybe TODO? */

      cksum = hash32(trace_bits, MAP_SIZE, HASH_CONST);

      /* If the deletion had no impact on the trace, make it permanent. This
         isn't perfect for variable-path inputs, but we're just making a
         best-effort pass, so it's not a big deal if we end up with false
         negatives every now and then. */

      // 若行为不变，则可以裁剪
      if (cksum == q->exec_cksum) {

        u32 move_tail = q->len - remove_pos - trim_avail;

        q->len -= trim_avail;
        len_p2  = next_p2(q->len);

        memmove(in_buf + remove_pos, in_buf + remove_pos + trim_avail, 
                move_tail);

        /* Let's save a clean trace, which will be needed by
           update_bitmap_score once we're done with the trimming stuff. */

        if (!needs_write) {

          needs_write = 1;
          memcpy(clean_trace, trace_bits, MAP_SIZE);

        }

      } else remove_pos += remove_len;

      /* Since this can be slow, update the screen every now and then. */

      if (!(trim_exec++ % stats_update_freq)) show_stats();
      stage_cur++;

    }

    // 块长度减半
    remove_len >>= 1;

  }

  /* If we have made changes to in_buf, we also need to update the on-disk
     version of the test case. */

  if (needs_write) {

    s32 fd;

    unlink(q->fname); /* ignore errors */

    fd = open(q->fname, O_WRONLY | O_CREAT | O_EXCL, 0600);

    if (fd < 0) PFATAL("Unable to create '%s'", q->fname);

    ck_write(fd, in_buf, q->len, q->fname);
    close(fd);

    memcpy(trace_bits, clean_trace, MAP_SIZE);
    update_bitmap_score(q);

  }

abort_trimming:

  bytes_trim_out += q->len;
  return fault;

}

　　可见 afl-fuzz 中的用例裁剪算法，就是 afl-tmin 的子集。它只使用了 afl-tmin 中的 block deletion 优化，而没有使用 alphabet minimization 和 character minimization。这显然是为了提升 fuzz 效率，尽量少浪费时间。继续看 fuzz_one 函数：

  /*********************
   * PERFORMANCE SCORE *
   *********************/

  orig_perf = perf_score = calculate_score(queue_cur);

  /* Skip right away if -d is given, if we have done deterministic fuzzing on
     this entry ourselves (was_fuzzed), or if it has gone through deterministic
     testing in earlier, resumed runs (passed_det). */

  if (skip_deterministic || queue_cur->was_fuzzed || queue_cur->passed_det)
    goto havoc_stage;

  /* Skip deterministic fuzzing if exec path checksum puts this out of scope
     for this master instance. */

  if (master_max && (queue_cur->exec_cksum % master_max) != master_id - 1)
    goto havoc_stage;

  doing_det = 1;

　　fuzzer 会给用例打分，依靠这个分数决定 havoc 阶段的投入资源（指「实验次数」，下同）。另外，如果全局变量 skip_deterministic 为 1、或这个用例曾经被 fuzz 过、或这个用例已经完成了 deterministic 变异阶段，则跳过 deterministic 变异，直接进入 havoc。

　　如果同时运行了多个 master（AFL 确实支持这样做，需要提供特定的 -M 选项），则它只会承担自己该处理的那部分用例的 deterministic 变异。

　　这里 calculate_score 函数需要跟进。源码如下：

/* Calculate case desirability score to adjust the length of havoc fuzzing.
   A helper function for fuzz_one(). Maybe some of these constants should
   go into config.h. */

static u32 calculate_score(struct queue_entry* q) {
  // 全局平均校准时间
  u32 avg_exec_us = total_cal_us / total_cal_cycles;
  // 全局平均 bitmap 大小
  u32 avg_bitmap_size = total_bitmap_size / total_bitmap_entries;
  
  u32 perf_score = 100;

  /* Adjust score based on execution speed of this path, compared to the
     global average. Multiplier ranges from 0.1x to 3x. Fast inputs are
     less expensive to fuzz, so we're giving them more air time. */
  
  // 本用例跑得越快，得分越高
  if (q->exec_us * 0.1 > avg_exec_us) perf_score = 10;
  else if (q->exec_us * 0.25 > avg_exec_us) perf_score = 25;
  else if (q->exec_us * 0.5 > avg_exec_us) perf_score = 50;
  else if (q->exec_us * 0.75 > avg_exec_us) perf_score = 75;
  else if (q->exec_us * 4 < avg_exec_us) perf_score = 300;
  else if (q->exec_us * 3 < avg_exec_us) perf_score = 200;
  else if (q->exec_us * 2 < avg_exec_us) perf_score = 150;

  /* Adjust score based on bitmap size. The working theory is that better
     coverage translates to better targets. Multiplier from 0.25x to 3x. */

  // 本用例覆盖度越高，得分越高
  if (q->bitmap_size * 0.3 > avg_bitmap_size) perf_score *= 3;
  else if (q->bitmap_size * 0.5 > avg_bitmap_size) perf_score *= 2;
  else if (q->bitmap_size * 0.75 > avg_bitmap_size) perf_score *= 1.5;
  else if (q->bitmap_size * 3 < avg_bitmap_size) perf_score *= 0.25;
  else if (q->bitmap_size * 2 < avg_bitmap_size) perf_score *= 0.5;
  else if (q->bitmap_size * 1.5 < avg_bitmap_size) perf_score *= 0.75;

  /* Adjust score based on handicap. Handicap is proportional to how late
     in the game we learned about this path. Latecomers are allowed to run
     for a bit longer until they catch up with the rest. */

  // 如果该用例发现得比较晚，则多给一些得分
  if (q->handicap >= 4) {

    perf_score *= 4;
    q->handicap -= 4;

  } else if (q->handicap) {

    perf_score *= 2;
    q->handicap--;

  }

  /* Final adjustment based on input depth, under the assumption that fuzzing
     deeper test cases is more likely to reveal stuff that can't be
     discovered with traditional fuzzers. */

  // 该用例深度越大，分数越高
  switch (q->depth) {

    case 0 ... 3:   break;
    case 4 ... 7:   perf_score *= 2; break;
    case 8 ... 13:  perf_score *= 3; break;
    case 14 ... 25: perf_score *= 4; break;
    default:        perf_score *= 5;

  }

  /* Make sure that we don't go over limit. */

  // 顶多给 1600 分
  if (perf_score > HAVOC_MAX_MULT * 100) perf_score = HAVOC_MAX_MULT * 100;

  return perf_score;

}

　　这是一个很主观的评分标准。简而言之，跑得越快、覆盖度越高、深度越大，分数就会越高，在 havoc 阶段就会有更多资源来尝试变异。

　　到此为止，我们读完了 fuzz_one 的准备工作部分。接下来，就是变异策略了。

fuzz_one - deterministic 变异阶段

　　这部分内容可以结合 x1do0 的博客阅读。第一个确定性变异算子是 bitflip 1/1，代码如下：


  /*********************************************
   * SIMPLE BITFLIP (+dictionary construction) *
   *********************************************/

# 翻转 _ar 的第 b 个 bit
#define FLIP_BIT(_ar, _b) do { \
    u8* _arf = (u8*)(_ar); \
    u32 _bf = (_b); \
    _arf[(_bf) >> 3] ^= (128 >> ((_bf) & 7)); \
  } while (0)

  /* Single walking bit. */

  stage_short = "flip1";
  stage_max   = len << 3;
  stage_name  = "bitflip 1/1";

  stage_val_type = STAGE_VAL_NONE;

  orig_hit_cnt = queued_paths + unique_crashes;

  prev_cksum = queue_cur->exec_cksum;

  for (stage_cur = 0; stage_cur < stage_max; stage_cur++) {

    stage_cur_byte = stage_cur >> 3;

    FLIP_BIT(out_buf, stage_cur);

    if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

    FLIP_BIT(out_buf, stage_cur);

    /* While flipping the least significant bit in every byte, pull of an extra
       trick to detect possible syntax tokens. In essence, the idea is that if
       you have a binary blob like this:

       xxxxxxxxIHDRxxxxxxxx

       ...and changing the leading and trailing bytes causes variable or no
       changes in program flow, but touching any character in the "IHDR" string
       always produces the same, distinctive path, it's highly likely that
       "IHDR" is an atomically-checked magic value of special significance to
       the fuzzed format.

       We do this here, rather than as a separate stage, because it's a nice
       way to keep the operation approximately "free" (i.e., no extra execs).
       
       Empirically, performing the check when flipping the least significant bit
       is advantageous, compared to doing it at the time of more disruptive
       changes, where the program flow may be affected in more violent ways.

       The caveat is that we won't generate dictionaries in the -d mode or -S
       mode - but that's probably a fair trade-off.

       This won't work particularly well with paths that exhibit variable
       behavior, but fails gracefully, so we'll carry out the checks anyway.

      */

    if (!dumb_mode && (stage_cur & 7) == 7) {

      u32 cksum = hash32(trace_bits, MAP_SIZE, HASH_CONST);

      if (stage_cur == stage_max - 1 && cksum == prev_cksum) {

        /* If at end of file and we are still collecting a string, grab the
           final character and force output. */

        if (a_len < MAX_AUTO_EXTRA) a_collect[a_len] = out_buf[stage_cur >> 3];
        a_len++;

        if (a_len >= MIN_AUTO_EXTRA && a_len <= MAX_AUTO_EXTRA)
          maybe_add_auto(a_collect, a_len);

      } else if (cksum != prev_cksum) {

        /* Otherwise, if the checksum has changed, see if we have something
           worthwhile queued up, and collect that if the answer is yes. */

        if (a_len >= MIN_AUTO_EXTRA && a_len <= MAX_AUTO_EXTRA)
          maybe_add_auto(a_collect, a_len);

        a_len = 0;
        prev_cksum = cksum;

      }

      /* Continue collecting string, but only if the bit flip actually made
         any difference - we don't want no-op tokens. */

      if (cksum != queue_cur->exec_cksum) {

        if (a_len < MAX_AUTO_EXTRA) a_collect[a_len] = out_buf[stage_cur >> 3];        
        a_len++;

      }

    }

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_FLIP1]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_FLIP1] += stage_max;

　　主要的逻辑就是翻转用例的每一个 bit，调用 common_fuzz_stuff() 去实验。但这里额外做了一件事——自动寻找 extra，构建词典。

💡

如果翻转一个字节的 LSB，发现程序行为与原用例不同，则这个字节可能属于一个 extra token。如果翻转这个字节的 LSB，程序行为与翻转前一个 LSB 的行为不同，则说明这里是 token 的分界点。

　　读到这里，我们有两个函数需要跟进——一个是 maybe_add_auto 函数，它负责将一个字符串加入词典；另一个函数是 common_fuzz_stuff，它负责执行一次实验。先看逻辑比较简单的 common_fuzz_stuff ：

/* Write a modified test case, run program, process results. Handle
   error conditions, returning 1 if it's time to bail out. This is
   a helper function for fuzz_one(). */

EXP_ST u8 common_fuzz_stuff(char** argv, u8* out_buf, u32 len) {

  u8 fault;

  if (post_handler) {
    // 前一篇文章提到过的 post handler 的调用位置

    out_buf = post_handler(out_buf, &len);
    if (!out_buf || !len) return 0;
  }

  // 把用例写入 out_file
  write_to_testcase(out_buf, len);

  // 实验
  fault = run_target(argv, exec_tmout);

  if (stop_soon) return 1;

  // subseq_tmouts 自增。若超过 250，说明对这个用例的 fuzz 过于耗时，放弃 fuzz
  if (fault == FAULT_TMOUT) {
    if (subseq_tmouts++ > TMOUT_LIMIT) {
      cur_skipped_paths++;
      return 1;
    }
  } else subseq_tmouts = 0;

  /* Users can hit us with SIGUSR1 to request the current input
     to be abandoned. */

  // 若收到前一篇文章提过的 SIGUSR1，则放弃当前用例的 fuzz
  if (skip_requested) {
     skip_requested = 0;
     cur_skipped_paths++;
     return 1;
  }

  /* This handles FAULT_ERROR for us: */
  
  // 判断是否有趣并加入队列
  queued_discovered += save_if_interesting(argv, out_buf, len, fault);

  // 刷新 ui
  if (!(stage_cur % stats_update_freq) || stage_cur + 1 == stage_max)
    show_stats();

  return 0;

}

　　它的流程是将变异出的用例写入 out_file 文件、调用 run_target() 执行实验、调用 save_if_interesting() 以考虑将这个用例保存下来。除此之外，它会维护一个计数器 subseq_tmouts，这是全局变量，表示当前这个用例的 fuzz 过程中超时了多少次。如果连续超时 250 次，则认为继续变异下去也会超时，于是放弃 fuzz 这个用例。

　　现在来跟进 maybe_add_auto 函数。

/* Maybe add automatic extra. */

static void maybe_add_auto(u8* mem, u32 len) {
  // 将 mem 加入词典
  u32 i;

  /* Allow users to specify that they don't want auto dictionaries. */
  // 如果用户不想使用词典，则直接返回
  if (!MAX_AUTO_EXTRAS || !USE_AUTO_EXTRAS) return;

  /* Skip runs of identical bytes. */

  // 如果 token 所有字符都相同，则不加入字典
  for (i = 1; i < len; i++)
    if (mem[0] ^ mem[i]) break;

  if (i == len) return;

  /* Reject builtin interesting values. */

  // 如果与内置的 interesting 表重复，则放弃。这里考虑了大小端
  if (len == 2) {

    i = sizeof(interesting_16) >> 1;

    while (i--) 
      if (*((u16*)mem) == interesting_16[i] ||
          *((u16*)mem) == SWAP16(interesting_16[i])) return;

  }

  if (len == 4) {

    i = sizeof(interesting_32) >> 2;

    while (i--) 
      if (*((u32*)mem) == interesting_32[i] ||
          *((u32*)mem) == SWAP32(interesting_32[i])) return;

  }

  /* Reject anything that matches existing extras. Do a case-insensitive
     match. We optimize by exploiting the fact that extras[] are sorted
     by size. */

  // 如果 token 已经存在于用户所提供的词典（即 extras 数组），则放弃。比较时忽略大小写
  
  // 注意到 extras 数组是按 len 递增的，因此这样写没问题
  for (i = 0; i < extras_cnt; i++)
    if (extras[i].len >= len) break;

  for (; i < extras_cnt && extras[i].len == len; i++)
    if (!memcmp_nocase(extras[i].data, mem, len)) return;

  /* Last but not least, check a_extras[] for matches. There are no
     guarantees of a particular sort order. */


  // 与自动发现的 extra（即 a_extras 数组）对比，去重
  auto_changed = 1;

  for (i = 0; i < a_extras_cnt; i++) {
    if (a_extras[i].len == len && !memcmp_nocase(a_extras[i].data, mem, len)) {
      // 若重复，则增加 hit_cnt
      a_extras[i].hit_cnt++;
      goto sort_a_extras;
    }
  }

  /* At this point, looks like we're dealing with a new entry. So, let's
     append it if we have room. Otherwise, let's randomly evict some other
     entry from the bottom half of the list. */

  // 全新的 extra token，插入到 a_extras 中

  // 若 a_extras 数量小于 500，则插入
  if (a_extras_cnt < MAX_AUTO_EXTRAS) {

    a_extras = ck_realloc_block(a_extras, (a_extras_cnt + 1) *
                                sizeof(struct extra_data));

    a_extras[a_extras_cnt].data = ck_memdup(mem, len);
    a_extras[a_extras_cnt].len  = len;
    a_extras_cnt++;

  } else {
    // 随机选一个 250 ~ 499 之间的 token，将其驱逐，替换为新来的 token
    i = MAX_AUTO_EXTRAS / 2 +
        UR((MAX_AUTO_EXTRAS + 1) / 2);

    ck_free(a_extras[i].data);

    a_extras[i].data    = ck_memdup(mem, len);
    a_extras[i].len     = len;
    a_extras[i].hit_cnt = 0;
  }

sort_a_extras:

  /* First, sort all auto extras by use count, descending order. */
  // 按 hit_cnt 排序
  qsort(a_extras, a_extras_cnt, sizeof(struct extra_data),
        compare_extras_use_d);

  /* Then, sort the top USE_AUTO_EXTRAS entries by size. */

  // 将 a_extras 的前 50 名按 len 排序
  qsort(a_extras, MIN(USE_AUTO_EXTRAS, a_extras_cnt),
        sizeof(struct extra_data), compare_extras_len);

}

　　可见，afl-fuzz 中有两个词典：一个是程序运行之初导入的用户词典，存放在 extras 数组；另一个是 fuzzer 自己发现的 extra token 组成的词典，放在 a_extras 数组。后者至多 500 个，如果超过了，则随机驱逐一个排名 250 以后的 token。

　　现在我们继续看 fuzz_one。

  /* Two walking bits. */

  stage_name  = "bitflip 2/1";
  stage_short = "flip2";
  stage_max   = (len << 3) - 1;

  orig_hit_cnt = new_hit_cnt;

  for (stage_cur = 0; stage_cur < stage_max; stage_cur++) {

    stage_cur_byte = stage_cur >> 3;

    FLIP_BIT(out_buf, stage_cur);
    FLIP_BIT(out_buf, stage_cur + 1);

    if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

    FLIP_BIT(out_buf, stage_cur);
    FLIP_BIT(out_buf, stage_cur + 1);

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_FLIP2]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_FLIP2] += stage_max;

　　这是 bitflip 2/1 变异，对于用例中所有的连续 2bit 进行翻转。

💡

bitflip a/b 的意思是翻转连续的 a 个 bit、步长为 b。例如 bitflip 2/1 就是翻转所有连续 2bit，bitflip 8/8 是每隔 8 个 bit 尝试翻转连续 8bit，也就是尝试翻转每一个字节。


  /* Four walking bits. */

  stage_name  = "bitflip 4/1";
  stage_short = "flip4";
  stage_max   = (len << 3) - 3;

  orig_hit_cnt = new_hit_cnt;

  for (stage_cur = 0; stage_cur < stage_max; stage_cur++) {

    stage_cur_byte = stage_cur >> 3;

    FLIP_BIT(out_buf, stage_cur);
    FLIP_BIT(out_buf, stage_cur + 1);
    FLIP_BIT(out_buf, stage_cur + 2);
    FLIP_BIT(out_buf, stage_cur + 3);

    if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

    FLIP_BIT(out_buf, stage_cur);
    FLIP_BIT(out_buf, stage_cur + 1);
    FLIP_BIT(out_buf, stage_cur + 2);
    FLIP_BIT(out_buf, stage_cur + 3);

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_FLIP4]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_FLIP4] += stage_max;

　　bitflip 4/1 变异，尝试翻转连续 4bit。


  /* Effector map setup. These macros calculate:

     EFF_APOS      - position of a particular file offset in the map.
     EFF_ALEN      - length of a map with a particular number of bytes.
     EFF_SPAN_ALEN - map span for a sequence of bytes.

   */

// EFF_MAP_SCALE2 被定义为 3
#define EFF_APOS(_p)          ((_p) >> EFF_MAP_SCALE2)
#define EFF_REM(_x)           ((_x) & ((1 << EFF_MAP_SCALE2) - 1))
#define EFF_ALEN(_l)          (EFF_APOS(_l) + !!EFF_REM(_l))
#define EFF_SPAN_ALEN(_p, _l) (EFF_APOS((_p) + (_l) - 1) - EFF_APOS(_p) + 1)

  /* Initialize effector map for the next step (see comments below). Always
     flag first and last byte as doing something. */

  eff_map    = ck_alloc(EFF_ALEN(len));
  eff_map[0] = 1;

  if (EFF_APOS(len - 1) != 0) {
    eff_map[EFF_APOS(len - 1)] = 1;
    eff_cnt++;
  }

  /* Walking byte. */

  stage_name  = "bitflip 8/8";
  stage_short = "flip8";
  stage_max   = len;

  orig_hit_cnt = new_hit_cnt;

  for (stage_cur = 0; stage_cur < stage_max; stage_cur++) {

    stage_cur_byte = stage_cur;

    out_buf[stage_cur] ^= 0xFF;

    if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

    /* We also use this stage to pull off a simple trick: we identify
       bytes that seem to have no effect on the current execution path
       even when fully flipped - and we skip them during more expensive
       deterministic stages, such as arithmetics or known ints. */

    // 接下来要顺路干与 afl-analyze 类似的事情，确定每个字节位置的敏感性
    // 如果一个位置不敏感，则不对它进行比较耗时的确定性变异（如 arith）

    if (!eff_map[EFF_APOS(stage_cur)]) {

      u32 cksum;

      /* If in dumb mode or if the file is very short, just flag everything
         without wasting time on checksums. */

      if (!dumb_mode && len >= EFF_MIN_LEN)
        cksum = hash32(trace_bits, MAP_SIZE, HASH_CONST);
      else
        cksum = ~queue_cur->exec_cksum;

      // 如果翻转这个字节会导致程序行为变化，则标记为敏感
      if (cksum != queue_cur->exec_cksum) {
        eff_map[EFF_APOS(stage_cur)] = 1;
        eff_cnt++;
      }

    }

    out_buf[stage_cur] ^= 0xFF;

  }

  /* If the effector map is more than EFF_MAX_PERC dense, just flag the
     whole thing as worth fuzzing, since we wouldn't be saving much time
     anyway. */

  // 若超过 90% 的位置都敏感，则干脆认为所有位置全都敏感
  if (eff_cnt != EFF_ALEN(len) &&
      eff_cnt * 100 / EFF_ALEN(len) > EFF_MAX_PERC) {

    memset(eff_map, 1, EFF_ALEN(len));

    blocks_eff_select += EFF_ALEN(len);

  } else {
    blocks_eff_select += eff_cnt;
  }

  blocks_eff_total += EFF_ALEN(len);

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_FLIP8]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_FLIP8] += stage_max;

　　bitflip 8/8，尝试翻转每个字节。在这个过程中，顺路做了敏感度分析。这个敏感度分析也比我们分析过的 afl-analyze 要粗糙，只要翻转后的执行路径与原始用例不同，就认为这个位置敏感（准确地说，认为这个位置所处的 8 字节的块敏感）。如果一个位置不敏感（对应 afl-analyze 中的「no-op」），那我们就不需要对那个位置做一些比较耗时的确定性变异。


  /* Two walking bytes. */

  if (len < 2) goto skip_bitflip;

  stage_name  = "bitflip 16/8";
  stage_short = "flip16";
  stage_cur   = 0;
  stage_max   = len - 1;

  orig_hit_cnt = new_hit_cnt;

  for (i = 0; i < len - 1; i++) {

    /* Let's consult the effector map... */

    if (!eff_map[EFF_APOS(i)] && !eff_map[EFF_APOS(i + 1)]) {
      stage_max--;
      continue;
    }

    stage_cur_byte = i;

    *(u16*)(out_buf + i) ^= 0xFFFF;

    if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;
    stage_cur++;

    *(u16*)(out_buf + i) ^= 0xFFFF;


  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_FLIP16]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_FLIP16] += stage_max;

　　这是 bitflip 16/8，尝试翻转所有连续的 2 字节。它使用到了刚刚构建的敏感度信息——若当前位置不敏感，则直接放弃翻转，节省一次实验。

  // 若用例长度小于 4，跳出 bitflip 变异
  if (len < 4) goto skip_bitflip;

  /* Four walking bytes. */

  stage_name  = "bitflip 32/8";
  stage_short = "flip32";
  stage_cur   = 0;
  stage_max   = len - 3;

  orig_hit_cnt = new_hit_cnt;

  for (i = 0; i < len - 3; i++) {

    /* Let's consult the effector map... */
    if (!eff_map[EFF_APOS(i)] && !eff_map[EFF_APOS(i + 1)] &&
        !eff_map[EFF_APOS(i + 2)] && !eff_map[EFF_APOS(i + 3)]) {
      stage_max--;
      continue;
    }

    stage_cur_byte = i;

    *(u32*)(out_buf + i) ^= 0xFFFFFFFF;

    if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;
    stage_cur++;

    *(u32*)(out_buf + i) ^= 0xFFFFFFFF;

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_FLIP32]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_FLIP32] += stage_max;

　　这是 bitflip 32/8，试图翻转所有连续的 uint32。以上是所有的 bitflip 变异，接下来进入 arith 变异。


  // no_arith 是环境变量 AFL_NO_ARITH 设置的
  if (no_arith) goto skip_arith;

  /**********************
   * ARITHMETIC INC/DEC *
   **********************/

  /* 8-bit arithmetics. */

  stage_name  = "arith 8/8";
  stage_short = "arith8";
  stage_cur   = 0;
  stage_max   = 2 * len * ARITH_MAX;

  stage_val_type = STAGE_VAL_LE;

  orig_hit_cnt = new_hit_cnt;

  for (i = 0; i < len; i++) {

    u8 orig = out_buf[i];

    /* Let's consult the effector map... */

    // 若位置不敏感，则跳过
    if (!eff_map[EFF_APOS(i)]) {
      stage_max -= 2 * ARITH_MAX;
      continue;
    }

    stage_cur_byte = i;

    // ARITH_MAX 被定义为 35
    for (j = 1; j <= ARITH_MAX; j++) {
      // 考虑给这个字节 + j

      // 只有当 orig + j 不可能被 bitflip 覆盖过时，才进行实验
      u8 r = orig ^ (orig + j);

      /* Do arithmetic operations only if the result couldn't be a product
         of a bitflip. */

      if (!could_be_bitflip(r)) {

        stage_cur_val = j;
        out_buf[i] = orig + j;

        if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;
        stage_cur++;

      } else stage_max--;

      // 实验 -j
      r =  orig ^ (orig - j);

      if (!could_be_bitflip(r)) {

        stage_cur_val = -j;
        out_buf[i] = orig - j;

        if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;
        stage_cur++;

      } else stage_max--;

      out_buf[i] = orig;

    }

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_ARITH8]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_ARITH8] += stage_max;

  /* 16-bit arithmetics, both endians. */
  // 下面是 16bit 的 arith，大致与 8bit 的相同。但分别考虑了大小端
  // ... 略

  /* 32-bit arithmetics, both endians. */
  // 下面是 32bit 的 arith
  // ... 略

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_ARITH32]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_ARITH32] += stage_max;

　　从代码中可以看到，arith 变异就是给每个 uint8、uint16、uint32 加上和减去一个量（1 到 35 之间），进行实验。另外，如果 bitflip 已经覆盖到了，则不重复实验。由于 arith 变异对每个位置要尝试大约 $35\times 3$ 次实验，耗时很长。

　　接下来是 interesting values 变异。


  /**********************
   * INTERESTING VALUES *
   **********************/

  stage_name  = "interest 8/8";
  stage_short = "int8";
  stage_cur   = 0;
  stage_max   = len * sizeof(interesting_8);

  stage_val_type = STAGE_VAL_LE;

  orig_hit_cnt = new_hit_cnt;

  /* Setting 8-bit integers. */

  for (i = 0; i < len; i++) {

    u8 orig = out_buf[i];

    /* Let's consult the effector map... */

    if (!eff_map[EFF_APOS(i)]) {
      stage_max -= sizeof(interesting_8);
      continue;
    }

    stage_cur_byte = i;

    for (j = 0; j < sizeof(interesting_8); j++) {

      /* Skip if the value could be a product of bitflips or arithmetics. */

      if (could_be_bitflip(orig ^ (u8)interesting_8[j]) ||
          could_be_arith(orig, (u8)interesting_8[j], 1)) {
        stage_max--;
        continue;
      }

      stage_cur_val = interesting_8[j];
      out_buf[i] = interesting_8[j];

      if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

      out_buf[i] = orig;
      stage_cur++;

    }

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_INTEREST8]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_INTEREST8] += stage_max;

  /* Setting 16-bit integers, both endians. */
  // ... 略
  
  /* Setting 32-bit integers, both endians. */
  // ... 略

　　interest 8/8 就是对于每个字节，把它替换成有趣的值。8bit 的有趣的值包括： -128, -1, 0, 1, 16, 32, 64, 100, 127。不敏感的位置不参与这个变异，被 bitflip 和 arith 覆盖过的也不再重复实验。

　　interest 16/8 和 32/8 大致逻辑与 8/8 相同，但大小端都会尝试。16bit 的有趣值，是在 8bit 有趣值的基础上添加 -32768, -129, 128, 255, 256, 512, 1000, 1024, 4096, 32767。32bit 的有趣值是在 8bit、16bit 的基础上添加 -2147483648LL, -100663046, -32769, 32768, 65535, 65536, 100663045, 2147483647。

　　interesting value 变异结束后，开始使用词典。


  /********************
   * DICTIONARY STUFF *
   ********************/

  if (!extras_cnt) goto skip_user_extras;

  /* Overwrite with user-supplied extras. */

  stage_name  = "user extras (over)";
  stage_short = "ext_UO";
  stage_cur   = 0;
  stage_max   = extras_cnt * len;

  stage_val_type = STAGE_VAL_NONE;

  orig_hit_cnt = new_hit_cnt;

  for (i = 0; i < len; i++) {

    u32 last_len = 0;

    stage_cur_byte = i;

    /* Extras are sorted by size, from smallest to largest. This means
       that we don't have to worry about restoring the buffer in
       between writes at a particular offset determined by the outer
       loop. */

    for (j = 0; j < extras_cnt; j++) {

      /* Skip extras probabilistically if extras_cnt > MAX_DET_EXTRAS. Also
         skip them if there's no room to insert the payload, if the token
         is redundant, or if its entire span has no bytes set in the effector
         map. */

      // 若 extras 超过 200 个，则概率性地放弃
      if ((extras_cnt > MAX_DET_EXTRAS && UR(extras_cnt) >= MAX_DET_EXTRAS) ||
          extras[j].len > len - i ||
          !memcmp(extras[j].data, out_buf + i, extras[j].len) ||
          !memchr(eff_map + EFF_APOS(i), 1, EFF_SPAN_ALEN(i, extras[j].len))) {

        stage_max--;
        continue;

      }

      last_len = extras[j].len;
      memcpy(out_buf + i, extras[j].data, last_len);

      if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

      stage_cur++;

    }

    /* Restore all the clobbered memory. */
    memcpy(out_buf + i, in_buf + i, last_len);

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_EXTRAS_UO]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_EXTRAS_UO] += stage_max;

　　以上是 user extras (over) 变异，对于每一个位置，尝试将那里替换为用户词典中的每一个元素。


  /* Insertion of user-supplied extras. */

  stage_name  = "user extras (insert)";
  stage_short = "ext_UI";
  stage_cur   = 0;
  stage_max   = extras_cnt * (len + 1);

  orig_hit_cnt = new_hit_cnt;

  ex_tmp = ck_alloc(len + MAX_DICT_FILE);

  for (i = 0; i <= len; i++) {

    stage_cur_byte = i;

    for (j = 0; j < extras_cnt; j++) {
      // MAX_FILE = 1024 * 1024，即 1024 KB
      if (len + extras[j].len > MAX_FILE) {
        stage_max--; 
        continue;
      }

      /* Insert token */
      memcpy(ex_tmp + i, extras[j].data, extras[j].len);

      /* Copy tail */
      memcpy(ex_tmp + i + extras[j].len, out_buf + i, len - i);

      if (common_fuzz_stuff(argv, ex_tmp, len + extras[j].len)) {
        ck_free(ex_tmp);
        goto abandon_entry;
      }

      stage_cur++;

    }

    /* Copy head */
    ex_tmp[i] = out_buf[i];

  }

  ck_free(ex_tmp);

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_EXTRAS_UI]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_EXTRAS_UI] += stage_max;

　　这是 user extras (insert) 变异，尝试在每个位置插入每个词典元素。以上，处理完了用户提供的词典，接下来开始处理 fuzzer 自动发现的词典。


  stage_name  = "auto extras (over)";
  stage_short = "ext_AO";
  stage_cur   = 0;
  stage_max   = MIN(a_extras_cnt, USE_AUTO_EXTRAS) * len;

  stage_val_type = STAGE_VAL_NONE;

  orig_hit_cnt = new_hit_cnt;

  for (i = 0; i < len; i++) {

    u32 last_len = 0;

    stage_cur_byte = i;

    // USE_AUTO_EXTRAS 被定义为 50
    for (j = 0; j < MIN(a_extras_cnt, USE_AUTO_EXTRAS); j++) {

      /* See the comment in the earlier code; extras are sorted by size. */

      if (a_extras[j].len > len - i ||
          !memcmp(a_extras[j].data, out_buf + i, a_extras[j].len) ||
          !memchr(eff_map + EFF_APOS(i), 1, EFF_SPAN_ALEN(i, a_extras[j].len))) {

        stage_max--;
        continue;

      }

      last_len = a_extras[j].len;
      memcpy(out_buf + i, a_extras[j].data, last_len);

      if (common_fuzz_stuff(argv, out_buf, len)) goto abandon_entry;

      stage_cur++;

    }

    /* Restore all the clobbered memory. */
    memcpy(out_buf + i, in_buf + i, last_len);

  }

  new_hit_cnt = queued_paths + unique_crashes;

  stage_finds[STAGE_EXTRAS_AO]  += new_hit_cnt - orig_hit_cnt;
  stage_cycles[STAGE_EXTRAS_AO] += stage_max;
  
skip_extras:

  /* If we made this to here without jumping to havoc_stage or abandon_entry,
     we're properly done with deterministic steps and can mark it as such
     in the .state/ directory. */

  // 将原始用例标为「已完成 deterministic 阶段」
  if (!queue_cur->passed_det) mark_as_det_done(queue_cur);

　　以上是 auto extras (over) 变异，对于每个位置，尝试将其替换为 auto extra 词典中的前 50 个元素。上文已经分析过，a_extras 数组的前 50 个元素是在所有 auto extra token 中 hit_cnt 次数最多的那些 token，按长度升序排序。按照代码，auto extras 只尝试替换，不会尝试插入。

　　deterministic 阶段到此结束。接下来是 havoc 阶段。

fuzz_one - havoc 变异阶段

　　havoc 阶段做的事情是「随机应用几个变异算子，进行实验」。阅读源码：


  /****************
   * RANDOM HAVOC *
   ****************/

havoc_stage:

  stage_cur_byte = -1;

  /* The havoc stage mutation code is also invoked when splicing files; if the
     splice_cycle variable is set, generate different descriptions and such. */

  if (!splice_cycle) {

    stage_name  = "havoc";
    stage_short = "havoc";

    // 决定 havoc 实验次数
    // 如果刚刚做过 deterministic 阶段，则多 4 倍实验次数

    // perf_score 是对测试用例的打分，跑得越快、覆盖度越高的用例分数会高
    // havoc_div 是 fuzzer dry run 时，观察执行速度得到的。
    // 程序越慢，havoc_div 越高，例如 0-19 execs/sec 时，havoc_div 是 10

    stage_max   = (doing_det ? HAVOC_CYCLES_INIT : HAVOC_CYCLES) *
                  perf_score / havoc_div / 100;
  } else {

    static u8 tmp[32];

    perf_score = orig_perf;

    sprintf(tmp, "splice %u", splice_cycle);
    stage_name  = tmp;
    stage_short = "splice";
    stage_max   = SPLICE_HAVOC * perf_score / havoc_div / 100;

  }

  // havoc 至少实验 16 次
  if (stage_max < HAVOC_MIN) stage_max = HAVOC_MIN;

  temp_len = len;

  orig_hit_cnt = queued_paths + unique_crashes;

  havoc_queued = queued_paths;

  /* We essentially just do several thousand runs (depending on perf_score)
     where we take the input file and make random stacked tweaks. */

  // 执行 havoc 阶段
  for (stage_cur = 0; stage_cur < stage_max; stage_cur++) {

    // 决定组合多少个变异算子。可能的选项是 2、4、8、16、32、64、128
    u32 use_stacking = 1 << (1 + UR(HAVOC_STACK_POW2));

    stage_cur_val = use_stacking;
 
    for (i = 0; i < use_stacking; i++) {

      switch (UR(15 + ((extras_cnt + a_extras_cnt) ? 2 : 0))) {
        // 17 个变异算子
      }

    }

    if (common_fuzz_stuff(argv, out_buf, temp_len))
      goto abandon_entry;

    /* out_buf might have been mangled a bit, so let's restore it to its
       original size and shape. */

    if (temp_len < len) out_buf = ck_realloc(out_buf, len);
    temp_len = len;
    memcpy(out_buf, in_buf, len);

    /* If we're finding new stuff, let's run for a bit longer, limits
       permitting. */

    if (queued_paths != havoc_queued) {

      if (perf_score <= HAVOC_MAX_MULT * 100) {
        stage_max  *= 2;
        perf_score *= 2;
      }

      havoc_queued = queued_paths;

    }

  }

  new_hit_cnt = queued_paths + unique_crashes;

  if (!splice_cycle) {
    stage_finds[STAGE_HAVOC]  += new_hit_cnt - orig_hit_cnt;
    stage_cycles[STAGE_HAVOC] += stage_max;
  } else {
    stage_finds[STAGE_SPLICE]  += new_hit_cnt - orig_hit_cnt;
    stage_cycles[STAGE_SPLICE] += stage_max;
  }

　　可见，havoc 阶段执行多少次实验，是由用例得分、 havoc_div 系数等共同决定的。每次实验，首先随机选择要组合多少个变异算子（选项是 2、4、8、16、32、64、128），然后均匀随机选取变异算子执行。

💡

havoc 阶段中，每个算子被选取的概率是相同的。有许多论文尝试改进变异算子的选取策略，例如 MOPT 和 DARWIN。

　　全部 17 种变异算子如下表：

　　接下来，我们分析这些变异算子：

        case 0:

          /* Flip a single bit somewhere. Spooky! */

          FLIP_BIT(out_buf, UR(temp_len << 3));
          break;

　　算子 0 是随机选择一个 bit 进行翻转。

        case 1: 

          /* Set byte to interesting value. */

          out_buf[UR(temp_len)] = interesting_8[UR(sizeof(interesting_8))];
          break;

　　算子 1 是随机选择一个字节，将其改为 8bit 的 interesting value。

        case 2:

          /* Set word to interesting value, randomly choosing endian. */

          if (temp_len < 2) break;

          if (UR(2)) {

            *(u16*)(out_buf + UR(temp_len - 1)) =
              interesting_16[UR(sizeof(interesting_16) >> 1)];

          } else {

            *(u16*)(out_buf + UR(temp_len - 1)) = SWAP16(
              interesting_16[UR(sizeof(interesting_16) >> 1)]);

          }

          break;

　　算子 2 是随机选择一个 word，将其替换为 16bit 的有趣值（随机选择大小端）。

        case 3:

          /* Set dword to interesting value, randomly choosing endian. */

          if (temp_len < 4) break;

          if (UR(2)) {
  
            *(u32*)(out_buf + UR(temp_len - 3)) =
              interesting_32[UR(sizeof(interesting_32) >> 2)];

          } else {

            *(u32*)(out_buf + UR(temp_len - 3)) = SWAP32(
              interesting_32[UR(sizeof(interesting_32) >> 2)]);

          }

          break;

　　算子 3 是随机选择一个 dword，将其替换为 32bit 有趣值。也是随机选取大小端。

        case 4:

          /* Randomly subtract from byte. */

          // ARITH_MAX 是 35，UR(35) 返回值在 [0, 34] 之间
          // 所以这里是随机减去 1~35 之间的值
          out_buf[UR(temp_len)] -= 1 + UR(ARITH_MAX);
          break;

　　算子 4 是选择一个字节，减去 $[1, 35]$ 之间的随机整数。

        case 5:

          /* Randomly add to byte. */

          out_buf[UR(temp_len)] += 1 + UR(ARITH_MAX);
          break;

　　算子 5 是选择一个字节，加上 $[1, 35]$ 之间的随机整数。

        case 6:

          /* Randomly subtract from word, random endian. */

          if (temp_len < 2) break;

          if (UR(2)) {

            u32 pos = UR(temp_len - 1);

            *(u16*)(out_buf + pos) -= 1 + UR(ARITH_MAX);

          } else {

            u32 pos = UR(temp_len - 1);
            u16 num = 1 + UR(ARITH_MAX);

            *(u16*)(out_buf + pos) =
              SWAP16(SWAP16(*(u16*)(out_buf + pos)) - num);

          }

          break;

　　算子 6 是随机选择一个 word，减去 $[1, 35]$ 之间的随机整数。随机选择大小端。

        case 7:

          /* Randomly add to word, random endian. */

          if (temp_len < 2) break;

          if (UR(2)) {

            u32 pos = UR(temp_len - 1);

            *(u16*)(out_buf + pos) += 1 + UR(ARITH_MAX);

          } else {

            u32 pos = UR(temp_len - 1);
            u16 num = 1 + UR(ARITH_MAX);

            *(u16*)(out_buf + pos) =
              SWAP16(SWAP16(*(u16*)(out_buf + pos)) + num);

          }

          break;

　　算子 7 是随机选择一个 word，加上 $[1, 35]$ 之间的随机整数。随机选择大小端。

        case 8:

          /* Randomly subtract from dword, random endian. */

          if (temp_len < 4) break;

          if (UR(2)) {

            u32 pos = UR(temp_len - 3);

            *(u32*)(out_buf + pos) -= 1 + UR(ARITH_MAX);

          } else {

            u32 pos = UR(temp_len - 3);
            u32 num = 1 + UR(ARITH_MAX);

            *(u32*)(out_buf + pos) =
              SWAP32(SWAP32(*(u32*)(out_buf + pos)) - num);

          }

          break;

　　算子 8 是随机选择一个 dword，减去 $[1, 35]$ 之间的随机整数。随机选择大小端。

        case 9:

          /* Randomly add to dword, random endian. */

          if (temp_len < 4) break;

          if (UR(2)) {

            u32 pos = UR(temp_len - 3);

            *(u32*)(out_buf + pos) += 1 + UR(ARITH_MAX);

          } else {

            u32 pos = UR(temp_len - 3);
            u32 num = 1 + UR(ARITH_MAX);

            *(u32*)(out_buf + pos) =
              SWAP32(SWAP32(*(u32*)(out_buf + pos)) + num);

          }

          break;

　　算子 9 是随机选择一个 dword，加上 $[1, 35]$ 之间的随机整数。随机选择大小端。

        case 10:

          /* Just set a random byte to a random value. Because,
             why not. We use XOR with 1-255 to eliminate the
             possibility of a no-op. */

          out_buf[UR(temp_len)] ^= 1 + UR(255);
          break;

　　算子 10 是随机选择一个字节，将其改为随机数。

        case 11 ... 12: {

            /* Delete bytes. We're making this a bit more likely
               than insertion (the next option) in hopes of keeping
               files reasonably small. */

            u32 del_from, del_len;

            if (temp_len < 2) break;

            /* Don't delete too much. */

            del_len = choose_block_len(temp_len - 1);

            del_from = UR(temp_len - del_len + 1);

            memmove(out_buf + del_from, out_buf + del_from + del_len,
                    temp_len - del_from - del_len);

            temp_len -= del_len;

            break;

          }

　　算子 11 和 12 都是随机选择一个位置，删掉之后的随机长度的内容。这里占用 2 个算子，使得「随机删除」这个操作的执行概率是其他算子的两倍，也算是贯彻了 AFL 的「数据越短，执行越快」哲学。

        case 13:

          if (temp_len + HAVOC_BLK_XL < MAX_FILE) {

            /* Clone bytes (75%) or insert a block of constant bytes (25%). */

            u8  actually_clone = UR(4);
            u32 clone_from, clone_to, clone_len;
            u8* new_buf;

            if (actually_clone) {

              clone_len  = choose_block_len(temp_len);
              clone_from = UR(temp_len - clone_len + 1);

            } else {

              clone_len = choose_block_len(HAVOC_BLK_XL);
              clone_from = 0;

            }

            clone_to   = UR(temp_len);

            new_buf = ck_alloc_nozero(temp_len + clone_len);

            /* Head */

            memcpy(new_buf, out_buf, clone_to);

            /* Inserted part */

            if (actually_clone)
              memcpy(new_buf + clone_to, out_buf + clone_from, clone_len);
            else
              memset(new_buf + clone_to,
                     UR(2) ? UR(256) : out_buf[UR(temp_len)], clone_len);

            /* Tail */
            memcpy(new_buf + clone_to + clone_len, out_buf + clone_to,
                   temp_len - clone_to);

            ck_free(out_buf);
            out_buf = new_buf;
            temp_len += clone_len;

          }

          break;

　　算子 13，随机插入块。在 75% 的概率下，将用例的随机一部分复制，插入到随机位置。在 25% 的概率下，插入一个随机长度的块，这个块的每个字节都是相同的。

        case 14: {

            /* Overwrite bytes with a randomly selected chunk (75%) or fixed
               bytes (25%). */

            u32 copy_from, copy_to, copy_len;

            if (temp_len < 2) break;

            copy_len  = choose_block_len(temp_len - 1);

            copy_from = UR(temp_len - copy_len + 1);
            copy_to   = UR(temp_len - copy_len + 1);

            if (UR(4)) {

              if (copy_from != copy_to)
                memmove(out_buf + copy_to, out_buf + copy_from, copy_len);

            } else memset(out_buf + copy_to,
                          UR(2) ? UR(256) : out_buf[UR(temp_len)], copy_len);

            break;

          }

　　算子 14，随机覆写块。在 75% 的概率下，选择一个随机部分，将其覆写到随机位置。在 25% 的概率下，选择一个随机部分，将其覆写为同一个字节。

        /* Values 15 and 16 can be selected only if there are any extras
           present in the dictionaries. */

        case 15: {

            /* Overwrite bytes with an extra. */

            if (!extras_cnt || (a_extras_cnt && UR(2))) {

              /* No user-specified extras or odds in our favor. Let's use an
                 auto-detected one. */

              u32 use_extra = UR(a_extras_cnt);
              u32 extra_len = a_extras[use_extra].len;
              u32 insert_at;

              if (extra_len > temp_len) break;

              insert_at = UR(temp_len - extra_len + 1);
              memcpy(out_buf + insert_at, a_extras[use_extra].data, extra_len);

            } else {

              /* No auto extras or odds in our favor. Use the dictionary. */

              u32 use_extra = UR(extras_cnt);
              u32 extra_len = extras[use_extra].len;
              u32 insert_at;

              if (extra_len > temp_len) break;

              insert_at = UR(temp_len - extra_len + 1);
              memcpy(out_buf + insert_at, extras[use_extra].data, extra_len);

            }

            break;

          }

　　算子 15 和算子 16 只有在启用字典的情况下才会考虑。算子 15 是用字典中的随机 token 覆写用例中的随机位置。

        case 16: {

            u32 use_extra, extra_len, insert_at = UR(temp_len + 1);
            u8* new_buf;

            /* Insert an extra. Do the same dice-rolling stuff as for the
               previous case. */

            if (!extras_cnt || (a_extras_cnt && UR(2))) {

              use_extra = UR(a_extras_cnt);
              extra_len = a_extras[use_extra].len;

              if (temp_len + extra_len >= MAX_FILE) break;

              new_buf = ck_alloc_nozero(temp_len + extra_len);

              /* Head */
              memcpy(new_buf, out_buf, insert_at);

              /* Inserted part */
              memcpy(new_buf + insert_at, a_extras[use_extra].data, extra_len);

            } else {

              use_extra = UR(extras_cnt);
              extra_len = extras[use_extra].len;

              if (temp_len + extra_len >= MAX_FILE) break;

              new_buf = ck_alloc_nozero(temp_len + extra_len);

              /* Head */
              memcpy(new_buf, out_buf, insert_at);

              /* Inserted part */
              memcpy(new_buf + insert_at, extras[use_extra].data, extra_len);

            }

            /* Tail */
            memcpy(new_buf + insert_at + extra_len, out_buf + insert_at,
                   temp_len - insert_at);

            ck_free(out_buf);
            out_buf   = new_buf;
            temp_len += extra_len;

            break;

          }

　　算子 16 是把字典中的随机 token，插入到用例中的随机位置。

　　我们终于分析完了全部 17 个 havoc 阶段的变异算子。如果某次实验发现了新成果，那么剩余的 havoc 执行次数会翻倍，来让这个用例再多跑一会，看看能不能继续发现更多成果。

fuzz_one - splicing 变异阶段

　　如果 deterministic 阶段和 havoc 阶段都未产生成果，则执行 splicing 阶段（否则不执行）。

　　splicing 阶段执行「杂交」操作。也就是说，将这个用例的一部分拼接上其他用例的一部分。来看代码：


  /************
   * SPLICING *
   ************/

  /* This is a last-resort strategy triggered by a full round with no findings.
     It takes the current input file, randomly selects another input, and
     splices them together at some offset, then relies on the havoc
     code to mutate that blob. */

retry_splicing:
  
  // SPLICE_CYCLES 被定义为 15
  if (use_splicing && splice_cycle++ < SPLICE_CYCLES &&
      queued_paths > 1 && queue_cur->len > 1) {

    struct queue_entry* target;
    u32 tid, split_at;
    u8* new_buf;
    s32 f_diff, l_diff;

    /* First of all, if we've modified in_buf for havoc, let's clean that
       up... */

    if (in_buf != orig_in) {
      ck_free(in_buf);
      in_buf = orig_in;
      len = queue_cur->len;
    }

    /* Pick a random queue entry and seek to it. Don't splice with yourself. */

    // 随机选择一个杂交对象
    do { tid = UR(queued_paths); } while (tid == current_entry);

    splicing_with = tid;


    // 在链表中跳转，找到目标
    target = queue;
    while (tid >= 100) { target = target->next_100; tid -= 100; }
    while (tid--) target = target->next;


    // 杂交目标要足够长
    /* Make sure that the target has a reasonable length. */
    while (target && (target->len < 2 || target == queue_cur)) {
      target = target->next;
      splicing_with++;
    }

    if (!target) goto retry_splicing;


    // 读入杂交目标
    /* Read the testcase into a new buffer. */
    fd = open(target->fname, O_RDONLY);
    if (fd < 0) PFATAL("Unable to open '%s'", target->fname);

    new_buf = ck_alloc_nozero(target->len);
    ck_read(fd, new_buf, target->len, target->fname);

    close(fd);

    /* Find a suitable splicing location, somewhere between the first and
       the last differing byte. Bail out if the difference is just a single
       byte or so. */

    // 找到两个串第一个、最后一个不同的位置
    locate_diffs(in_buf, new_buf, MIN(len, target->len), &f_diff, &l_diff);

    // 如果两个串差异只有一两个字节，则重新选择
    if (f_diff < 0 || l_diff < 2 || f_diff == l_diff) {
      ck_free(new_buf);
      goto retry_splicing;
    }

    /* Split somewhere between the first and last differing byte. */

    // 分割点，生成一个新用例 A[:split_at] + B[split_at:]
    split_at = f_diff + UR(l_diff - f_diff);

    /* Do the thing. */

    len = target->len;
    memcpy(new_buf, in_buf, split_at);
    in_buf = new_buf;

    ck_free(out_buf);
    out_buf = ck_alloc_nozero(len);
    memcpy(out_buf, in_buf, len);

    // 执行一遍 havoc
    goto havoc_stage;

  }

　　splicing 顶多执行 15 次。在尝试杂交时，首先随机选取一个足够长、且差异足够大的杂交目标，然后随机选择分割点，把本用例的前面一段和杂交目标的后面一段拼接起来。这样会形成一个新的用例，将这个新用例拿去执行 havoc 阶段变异。

　　可以预见，如果真的执行了 splicing 阶段，那它耗费的时间将会比 havoc 阶段长很多（因为 splicing 阶段产生的用例都要跑一遍 havoc 阶段）。

　　以上，我们终于完成了 afl-fuzz.c 源码的阅读。现在，我们几乎读遍了 AFL 项目除了 ui 以外的所有源码。不过，作为 fuzzing 研究者，我们也需要掌握修改 AFL 代码的能力。因此，下一篇文章（也就是完结篇）会用几个例子说明如何在 AFL 的基础上二次开发。

AFL源码阅读（六）：队列、变异、同步

Ruan Xingzhi

Ruan Xingzhi

主循环

cull_queue 函数（精简队列）

sync_fuzzers 函数

save_if_interesting 函数（保存有趣的用例）

add_to_queue 函数（将用例加入队列）

fuzz_one 函数（从一个用例变异出许多用例）

fuzz_one - 准备工作

fuzz_one - deterministic 变异阶段

fuzz_one - havoc 变异阶段

fuzz_one - splicing 变异阶段

AFL 二次开发的若干心得

如何 fuzz argv：以 DISCOUNT 为例

GPicView fuzz 实践

AFL源码阅读（七）：如何修改 AFL

AFL源码阅读（五）：fuzzer启动