Heap Exploitation - tcache

tcache是glibc2.26（ubuntu17.10）之后引入的一种技术，目的是堆管理的性能，在检测机制上有些变化，也新增了一些利用方式。

TCache全名为Thread Local Caching，它会为每个线程创建一个缓存，里面包含一些小堆块，无需对arena上锁即可使用，提升了分配性能。

数据结构

glibc在编译时使用USE_TCACHE条件开启tcache机制，默认开启

#if USE_TCACHE
/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
# define TCACHE_MAX_BINS        64
# define MAX_TCACHE_SIZE    tidx2usize (TCACHE_MAX_BINS-1)

/* Only used to pre-fill the tunables.  */
# define tidx2usize(idx)    (((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)

/* When "x" is from chunksize().  */
# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
/* When "x" is a user-provided size.  */
# define usize2tidx(x) csize2tidx (request2size (x))

/* With rounding and alignment, the bins are...
   idx 0   bytes 0..24 (64-bit) or 0..12 (32-bit)
   idx 1   bytes 25..40 or 13..20
   idx 2   bytes 41..56 or 21..28
   etc.  */

/* This is another arbitrary limit, which tunables can change.  Each
   tcache bin will hold at most this number of chunks.  */
# define TCACHE_FILL_COUNT 7
#endif

每个线程默认使用64个单链表结构的bins，每个bins最多存放7个chunk。在64位系统上以16字节递增，从24到1032字节。32位系统上以8字节递增，从12到512字节。tcache bin只用于存放非large的chunk。

tcache引入了两个新的结构体，tcache_entry和tcache_perthread_struct

typedef struct tcache_entry
{
  struct tcache_entry *next;
} tcache_entry;

typedef struct tcache_perthread_struct
{
  char counts[TCACHE_MAX_BINS];
  tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;

static __thread tcache_perthread_struct *tcache = NULL;

tcache_perthread_struct位于堆开头的位置，说明其本身也是一个堆块，大小为0x250。它是整个 tcache 的管理结构，一共有 TCACHE_MAX_BINS 个计数器和 TCACHE_MAX_BINS项。

tcache_entry其中包括数组entries用于放置64个bin的地址，数组counts则存放每个bin中的chunk数量，每个被放入bins的chunk都会在其用户数据中包含一个tcache_entry（即fd指针），链接了相同大小的空闲的下一个chunk的用户数据，类似于fast bins，从而构成单链表。需要注意的是这里的 next 指向 chunk 的 user data，而 fastbin 的 fd 指向 chunk 开头的地址。而且，tcache_entry 会复用空闲 chunk 的 user data 部分。

初始化操作

在_int_malloc中，第一次malloc时会进入到MAYBE_INIT_TCACHE ()，其中 MAYBE_INIT_TCACHE () 在 tcache 为空（即第一次 malloc）时调用了 tcache_init()

# define MAYBE_INIT_TCACHE() \
  if (__glibc_unlikely (tcache == NULL)) \
    tcache_init();

#else  /* !USE_TCACHE */
# define MAYBE_INIT_TCACHE()

初始化操作如下：

static void tcache_init(void)
{
  mstate ar_ptr;
  void *victim = 0;
  const size_t bytes = sizeof (tcache_perthread_struct);	//获得malloc需要的字节数

  if (tcache_shutting_down)
    return;

  arena_get (ar_ptr, bytes);
  victim = _int_malloc (ar_ptr, bytes);//使用malloc为该结构分配内存
  if (!victim && ar_ptr != NULL)
    {
      ar_ptr = arena_get_retry (ar_ptr, bytes);
      victim = _int_malloc (ar_ptr, bytes);
    }


  if (ar_ptr != NULL)
    __libc_lock_unlock (ar_ptr->mutex);

  /* In a low memory situation, we may not be able to allocate memory
     - in which case, we just keep trying later.  However, we
     typically do this very early, so either there is sufficient
     memory, or there isn't enough memory to do non-trivial
     allocations anyway.  */
  if (victim)
    {
      tcache = (tcache_perthread_struct *) victim;	//存放
      memset (tcache, 0, sizeof (tcache_perthread_struct));	//清零
    }

}

tcache的tcache_perthread_struct结构是由_int_malloc来分配并通过memset清零来进行初始化的。tcache_init()成功返回后，tcache_perthread_struct 就被成功建立了。

申请内存

向tcache放入chunk

_libc_free中并没有变化，int_free中有一些修改，在fastbin的操作之前执行，当大小符合要求，即为non-large chunk并且对应的bin还未满7个就调用tcache_put将该块放入tcache

static void
_int_free (mstate av, mchunkptr p, int have_lock)
{
  ......
  ......
#if USE_TCACHE
  {
    size_t tc_idx = csize2tidx (size);
    if (tcache
        && tc_idx < mp_.tcache_bins // 64
        && tcache->counts[tc_idx] < mp_.tcache_count) // 7
      {
        tcache_put (p, tc_idx);
        return;
      }
  }
#endif
  ......
  ......

调用_int_malloc时，有三处会触发将chunk放入tcache的操作

fast bin中返回了一个chunk，对应的fast bin中的其他chunk会被放进对应的tcache bin中，直到满或者fast bin中为空为止。需要注意的是，chunks在tcache bin中和fast bins中的顺序相反

#if USE_TCACHE
	  /* While we're here, if we see other chunks of the same size,
	     stash them in the tcache.  */
	  size_t tc_idx = csize2tidx (nb);
	  if (tcache && tc_idx < mp_.tcache_bins)
	    {
	      mchunkptr tc_victim;

	      /* While bin not empty and tcache not full, copy chunks over.  */
	      while (tcache->counts[tc_idx] < mp_.tcache_count
		     && (pp = *fb) != NULL)
		{
		  REMOVE_FB (fb, tc_victim, pp);
		  if (tc_victim != 0)
		    {
		      tcache_put (tc_victim, tc_idx);
	            }
		}
	    }
#endif
          void *p = chunk2mem (victim);
          alloc_perturb (p, bytes);
          return p;
        }
    }

small bins中的情况和fast bins中类似，双链表中的剩余chunk会被填充到tcache bin中，直到small bins空或者tcache bin满为止

#if USE_TCACHE
	  /* While we're here, if we see other chunks of the same size,
	     stash them in the tcache.  */
	  size_t tc_idx = csize2tidx (nb);
	  if (tcache && tc_idx < mp_.tcache_bins)
	    {
	      mchunkptr tc_victim;

	      /* While bin not empty and tcache not full, copy chunks over.  */
	      while (tcache->counts[tc_idx] < mp_.tcache_count
		     && (tc_victim = last (bin)) != bin)
		{
		  if (tc_victim != 0)
		    {
		      bck = tc_victim->bk;
		      set_inuse_bit_at_offset (tc_victim, nb);
		      if (av != &main_arena)
			set_non_main_arena (tc_victim);
		      bin->bk = bck;
		      bck->fd = bin;

		      tcache_put (tc_victim, tc_idx);
	            }
		}
	    }
#endif

在unsorted bin中，当用户所需的chunk大小与unsorted chunk一致，那么首先会将该chunk放入对应的tcache中并跳过该次循环，对于unsorted bin而言，与用户需求大小相等的块会被优先放入tcache中

#if USE_TCACHE
	      /* Fill cache first, return to user only if cache fills.
		 We may return one of these chunks later.  */
	      if (tcache_nb
		  && tcache->counts[tc_idx] < mp_.tcache_count)
		{
		  tcache_put (victim, tc_idx);
		  return_cached = 1;
		  continue;
		}
	      else
		{
#endif

从tcache取出chunk

在_int_malloc之前，当用户需要的chunk大小不为large并且tcache已经完成初始化，并且tcache中有相应的chunk块，调用tcache_get()从tcache取出chunk。注意取出chunk是在fast bins之前，也就是说，tcache是最高一级的缓存措施。

void *
__libc_malloc (size_t bytes)
{
  mstate ar_ptr;
  void *victim;

  void *(*hook) (size_t, const void *)
    = atomic_forced_read (__malloc_hook);
  if (__builtin_expect (hook != NULL, 0))
    return (*hook)(bytes, RETURN_ADDRESS (0));
#if USE_TCACHE
  /* int_free also calls request2size, be careful to not pad twice.  */
  size_t tbytes;
  checked_request2size (bytes, tbytes);
  size_t tc_idx = csize2tidx (tbytes);

  MAYBE_INIT_TCACHE ();

  DIAG_PUSH_NEEDS_COMMENT;
  if (tc_idx < mp_.tcache_bins
      /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
      && tcache
      && tcache->entries[tc_idx] != NULL)
    {
      return tcache_get (tc_idx);
    }
  DIAG_POP_NEEDS_COMMENT;
#endif

在binning code中，如果在tcache中放入的chunk到了上限就会直接返回最后一个chunk，默认情况下是没有限制的。

#if USE_TCACHE
      /* If we've processed as many chunks as we're allowed while
	 filling the cache, return one of the cached ones.  */
      ++tcache_unsorted_count;
      if (return_cached
	  && mp_.tcache_unsorted_limit > 0
	  && tcache_unsorted_count > mp_.tcache_unsorted_limit)
	{
	  return tcache_get (tc_idx);
	}
#endif

bining code结束后，如果没有直接返回，那么如果至少有一个符合要求的chunk被找到，则返回最后一个。

#if USE_TCACHE
      /* If all the small chunks we found ended up cached, return one now.  */
      if (return_cached)
	{
	  return tcache_get (tc_idx);
	}
#endif

另外，tcache中的chunk不会被合并，prev_inuse始终不为0。

安全性分析

函数tcahce_put()和tcache_get()函数分别用于从单链表中放入和取出chunk。

tcache_get()

/* Caller must ensure that we know tc_idx is valid and there's
   available chunks to remove.  */
static __always_inline void *
tcache_get (size_t tc_idx)
{
  tcache_entry *e = tcache->entries[tc_idx];
  assert (tc_idx < TCACHE_MAX_BINS);
  assert (tcache->entries[tc_idx] > 0);
  tcache->entries[tc_idx] = e->next;
  --(tcache->counts[tc_idx]); // 获得一个 chunk，counts 减一
  return (void *) e;
}

从tcache中取出chunk的函数，几乎没有任何保护。

tcahe_put()

/* Caller must ensure that we know tc_idx is valid and there's room
   for more chunks.  */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
  assert (tc_idx < TCACHE_MAX_BINS);
  e->next = tcache->entries[tc_idx];
  tcache->entries[tc_idx] = e;
  ++(tcache->counts[tc_idx]);
}

没有将P位置零并且几乎没有任何保护。

通过以上可以看出这两个函数都假设调用者对参数进行了有效性检查，但是由于tcache的操作在free和malloc都处于很靠前的位置，所以很多的有效性检查没有起到作用，这样做有效地提升了效率，但是安全性降低，反而使得堆利用的方法更多了。

tcache_get()函数中，assert (tcache->entries[tc_idx] > 0);此处本意是检查tcahce bin中chunk的数量是否大于0，但是counts可以发生整数溢出变成负数。该问题在glibc2.28得到修复。

CVE-2017-17426

libc-2.26的tcache机制发现了安全漏洞，由于_libc_malloc()使用了request2size()来将请求大小转换为实际大小，而该函数没有进行整数溢出检查。所以如果请求一个很大的堆块（接近SIZE_MAX），那么就会导致整数溢出，从而导致malloc错误地返回tcache bin中的堆块。

#include <stdio.h>
#include <stdlib.h>
int main() {
    void *x = malloc(10);
    printf("malloc(10): %p\n", x);
    free(x);
    void *y = malloc(((size_t)~0) - 2); // overflow allocation (size_t.max-2)
    printf("malloc(((size_t)~0) - 2): %p\n", y);
}

$ gcc cve201717426.c
$ /usr/local/glibc-2.26/lib/ld-2.26.so ./a.out
malloc(10): 0x7f3f945ed260
malloc(((size_t)~0) - 2): 0x7f3f945ed260
$ /usr/local/glibc-2.27/lib/ld-2.27.so ./a.out
malloc(10): 0x7f399c69e260
malloc(((size_t)~0) - 2): (nil)

使用glibc2.26时，第二次malloc返回了第一次free的堆块，使用glibc2.27时，返回NULL，问题已经被修复。

修复方法，使用cecksed_request2size()函数替换request2size()函数，实现对整数溢出的检查

二次释放检查

libc-2.28增加了对tache中二次释放的检查，方法是在tcache_entry结构中增加一哥标志key，用于表示chunk是否已经在tcache bin中

index 6d7a6a8..f730d7a 100644 (file)
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2967,6 +2967,8 @@ mremap_chunk (mchunkptr p, size_t new_size)
 typedef struct tcache_entry
 {
   struct tcache_entry *next;
+  /* This field exists to detect double frees.  */
+  struct tcache_perthread_struct *key;
 } tcache_entry;

 /* There is one of these for each thread, which contains the
@@ -2990,6 +2992,11 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
 {
   tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
   assert (tc_idx < TCACHE_MAX_BINS);
+
+  /* Mark this chunk as "in the tcache" so the test in _int_free will
+     detect a double free.  */
+  e->key = tcache;
+
   e->next = tcache->entries[tc_idx];
   tcache->entries[tc_idx] = e;
   ++(tcache->counts[tc_idx]);
@@ -3005,6 +3012,7 @@ tcache_get (size_t tc_idx)
   assert (tcache->entries[tc_idx] > 0);
   tcache->entries[tc_idx] = e->next;
   --(tcache->counts[tc_idx]);
+  e->key = NULL;
   return (void *) e;
 }

@@ -4218,6 +4226,26 @@ _int_free (mstate av, mchunkptr p, int have_lock)
   {
     size_t tc_idx = csize2tidx (size);

+    /* Check to see if it's already in the tcache.  */
+    tcache_entry *e = (tcache_entry *) chunk2mem (p);
+
+    /* This test succeeds on double free.  However, we don't 100%
+       trust it (it also matches random payload data at a 1 in
+       2^<size_t> chance), so verify it's not an unlikely coincidence
+       before aborting.  */
+    if (__glibc_unlikely (e->key == tcache && tcache))
+      {
+       tcache_entry *tmp;
+       LIBC_PROBE (memory_tcache_double_free, 2, e, tc_idx);
+       for (tmp = tcache->entries[tc_idx];
+            tmp;
+            tmp = tmp->next)
+         if (tmp == e)
+           malloc_printerr ("free(): double free detected in tcache 2");
+       /* If we get here, it was a coincidence.  We've wasted a few
+          cycles, but don't abort.  */
+      }
+
     if (tcache
        && tc_idx < mp_.tcache_bins
        && tcache->counts[tc_idx] < mp_.tcache_count)