TransWikia.com

RFC 1951 "Inflate" (de-compression)

Code Review Asked by George Barwood on November 25, 2021

My exploration of Rust continues. I’m quite pleased with the 2-step table lookup for decoding variable bit-length codes here, for some reason I found this tricky to work out, but now it’s done I like it. InputBitStream has a required "lifetime parameter", it seems these are not given meaningful names, which seems odd, but perhaps that’s just how it is.

/// RFC 1951 inflate ( de-compress ).

pub fn inflate( data: &[u8] ) -> Vec<u8>
{
  let mut input = InputBitStream::new( &data );
  let mut output = Vec::new();
  let _flags = input.get_bits( 16 );
  loop
  {
    let last_block = input.get_bit();
    let block_type = input.get_bits( 2 );
    match block_type
    {
      2 => dyn_block( &mut input, &mut output ),
      1 => fixed_block( &mut input, &mut output ),
      0 => copy_block( &mut input, &mut output ),
      _ => ()
    }
    if last_block != 0 { break; }
  }  
  // Check the checksum.
  input.clear_bits();
  let check_sum = input.get_bits(32) as u32;
  if crate::compress::adler32( &output ) != check_sum { panic!( "Bad checksum" ) }
  output
}

/// Decode block encoded with dynamic Huffman codes.
fn dyn_block( input: &mut InputBitStream, output: &mut Vec<u8> )
{
  let n_lit = 257 + input.get_bits( 5 );
  let n_dist = 1 + input.get_bits( 5 );
  let n_len = 4 + input.get_bits( 4 );

  // The lengths of the main Huffman codes (lit,dist) are themselves decoded by LenDecoder.
  let mut len = LenDecoder::new( n_len, input );
  let lit : BitDecoder = len.get_decoder( n_lit, input );
  let dist : BitDecoder = len.get_decoder( n_dist, input ); 

  loop
  {
    let x : usize = lit.decode( input );
    match x
    {
      0..=255 => output.push( x as u8 ),
      256 => break,
      _ => // LZ77 match code - replicate earlier output.
      {
        let mc = x - 257;
        let length = MATCH_OFF[ mc ] as usize + input.get_bits( MATCH_EXTRA[ mc ] as usize );
        let dc = dist.decode( input );
        let distance = DIST_OFF[ dc ] as usize + input.get_bits( DIST_EXTRA[ dc ] as usize );
        copy( output, distance, length ); 
      }
    }
  }
} // end do_dyn

/// Copy length bytes from output ( at specified distance ) to output.
fn copy( output: &mut Vec<u8>, distance: usize, mut length: usize )
{
  let mut i = output.len() - distance;
  while length > 0
  {
    output.push( output[ i ] );
    i += 1;
    length -= 1;
  }
}

/// Decode length-limited Huffman codes.
// For speed, a lookup table is used to compute symbols from the variable length codes ( rather than reading single bits ).
// To keep the lookup table small, codes longer than PEEK bits are looked up in two operations.
struct BitDecoder
{
  nsym: usize, // The number of symbols.
  bits: Vec<u8>, // The length in bits of the code that represents each symbol.
  maxbits: usize, // The length in bits of the longest code.
  peekbits: usize, // The bit length for the first lookup ( not greater than PEEK ).
  lookup: Vec<usize> // The table used to look up a symbol from a code.
}

/// Maximum number of bits for first lookup.
const PEEK : usize = 8; 

impl BitDecoder
{
  fn new( nsym: usize ) -> BitDecoder
  {
    BitDecoder 
    { 
      nsym,
      bits: vec![0; nsym],
      maxbits: 0,
      peekbits: 0,
      lookup: Vec::new()
    }
  }

  /// The main function : get a decoded symbol from the input bit stream.
  /// Codes of up to PEEK bits are looked up in a single operation.
  /// Codes of more than PEEK bits are looked up in two steps.
  fn decode( &self, input: &mut InputBitStream ) -> usize
  {
    let mut sym = self.lookup[ input.peek( self.peekbits ) ];
    if sym >= self.nsym
    {
      sym = self.lookup[ sym - self.nsym + ( input.peek( self.maxbits ) >> self.peekbits ) ];
    }  
    input.advance( self.bits[ sym ] as usize );
    sym
  }

  fn init_lookup( &mut self )
  {
    let mut max_bits : usize = 0; 
    for bp in &self.bits 
    { 
      let bits = *bp as usize;
      if bits > max_bits { max_bits = bits; } 
    }

    self.maxbits = max_bits;
    self.peekbits = if max_bits > PEEK { PEEK } else { max_bits };
    self.lookup.resize( 1 << self.peekbits, 0 );

    // Code below is from rfc1951 page 7.

    // bl_count is the number of codes of length N, N >= 1.
    let mut bl_count : Vec<usize> = vec![ 0; max_bits + 1 ];

    for sym in 0..self.nsym { bl_count[ self.bits[ sym ] as usize ] += 1; }

    let mut next_code : Vec<usize> = vec![ 0; max_bits + 1 ];
    let mut code = 0; 
    bl_count[ 0 ] = 0;

    for i in 0..max_bits
    {
      code = ( code + bl_count[ i ] ) << 1;
      next_code[ i + 1 ] = code;
    }

    for sym in 0..self.nsym
    {
      let length = self.bits[ sym ] as usize;
      if length != 0
      {
        self.setup_code( sym, length, next_code[ length ] );
        next_code[ length ] += 1;
      }
    }
  }

  fn setup_code( &mut self, sym: usize, len: usize, mut code: usize )
  {
    if len <= self.peekbits
    {
      let diff = self.peekbits - len;
      code <<= diff;
      for i in code..code + (1 << diff)
      {
        // lookup index is reversed to match InputBitStream::peek
        self.lookup[ reverse( i, self.peekbits ) ] = sym;
      }
    } else { // Secondary lookup required      
      let peekbits2 = self.maxbits - self.peekbits;

      // Split code into peekbits portion ( key ) and remainder ( code).
      let diff1 = len - self.peekbits;
      let key = reverse( code >> diff1, self.peekbits );
      code &= ( 1 << diff1 ) - 1;

      // Get the base for the secondary lookup.
      let mut base = self.lookup[ key ];
      if base == 0 // Secondary lookup not yet allocated for this key.
      {
        base = self.lookup.len();
        self.lookup.resize( base + ( 1 << peekbits2 ), 0 );
        self.lookup[ key ] = self.nsym + base;
      } else {
        base -= self.nsym;
      }

      // Set the secondary lookup values.
      let diff = self.maxbits - len;
      code <<= diff;
      for i in code..code + (1<<diff)
      { 
        self.lookup[ base + reverse( i, peekbits2 ) ] = sym;
      }
    }    
  }
} // end impl BitDecoder

/// Decodes an array of lengths, returning a new BitDecoder.  
/// There are special codes for repeats, and repeats of zeros, per RFC 1951 page 13.
struct LenDecoder
{
  plenc: u8, // previous length code ( which can be repeated )
  rep: usize,   // repeat
  bd: BitDecoder
}

impl LenDecoder
{
  fn new(  n_len: usize, input: &mut InputBitStream ) -> LenDecoder
  {
    let mut result = LenDecoder { plenc: 0, rep:0, bd: BitDecoder::new( 19 ) };

    // Read the array of 3-bit code lengths (used to encode the main code lengths ) from input.
    for i in CLEN_ALPHABET.iter().take( n_len )
    { 
      result.bd.bits[ *i as usize ] = input.get_bits(3) as u8; 
    }
    result.bd.init_lookup();
    result
  }

  fn get_decoder( &mut self, nsym: usize, input: &mut InputBitStream ) -> BitDecoder
  {
    let mut result = BitDecoder::new( nsym );
    let bits = &mut result.bits;
    let mut i = 0;
    while self.rep > 0 { bits[ i ] = self.plenc; i += 1; self.rep -= 1; }
    while i < nsym
    { 
      let lenc = self.bd.decode( input ) as u8;
      if lenc < 16 
      {
        bits[ i ] = lenc; 
        i += 1; 
        self.plenc = lenc; 
      } else {
        if lenc == 16 { self.rep = 3 + input.get_bits(2); }
        else if lenc == 17 { self.rep = 3 + input.get_bits(3); self.plenc=0; }
        else if lenc == 18 { self.rep = 11 + input.get_bits(7); self.plenc=0; } 
        while i < nsym && self.rep > 0 { bits[ i ] = self.plenc; i += 1; self.rep -= 1; }
      }
    }
    result.init_lookup();
    result
  }
} // end impl LenDecoder

/// For reading bits from input array of bytes.
struct InputBitStream<'a>
{
  data: &'a [u8], // Input data.
  pos: usize, // Position in input data.
  buf: usize, // Bit buffer.
  got: usize, // Number of bits in buffer.
}

impl <'a> InputBitStream<'a>
{
  fn new( data: &'a [u8] ) -> InputBitStream
  {
    InputBitStream { data, pos: 0, buf: 1, got: 0 }
  } 

  // Get n bits of input ( but do not advance ).
  fn peek( &mut self, n: usize ) -> usize
  {
    while self.got < n
    {
      // Not necessary to check index, considering adler32 checksum is 32 bits.
      self.buf |= ( self.data[ self.pos ] as usize ) << self.got;
      self.pos += 1;
      self.got += 8;
    }
    self.buf & ( ( 1 << n ) - 1 )
  }

  // Advance n bits.
  fn advance( &mut self, n:usize )
  { 
    self.buf >>= n;
    self.got -= n;
  }

  // Get a single bit.
  fn get_bit( &mut self ) -> usize
  {
    if self.got == 0 { self.peek( 1 ); }
    let result = self.buf & 1;
    self.advance( 1 );
    result
  }

  // Get n bits of input.
  fn get_bits( &mut self, n: usize ) -> usize
  { 
    let result = self.peek( n );
    self.advance( n );
    result
  }

  // Get n bits of input, reversed.
  fn get_huff( &mut self, mut n: usize ) -> usize 
  { 
    let mut result = 0; 
    while n > 0
    { 
      result = ( result << 1 ) + self.get_bit(); 
      n -= 1;
    }
    result
  }

  // Discard any buffered bits.
  fn clear_bits( &mut self )
  {
    // Note: this might work right if peeking more than 8 bits.
    self.got = 0;
  }
} // end impl InputBitStream

/// Reverse a string of n bits.
pub fn reverse( mut x:usize, mut n: usize ) -> usize
{ 
  let mut result: usize = 0; 
  while n > 0
  {
    result = ( result << 1 ) | ( x & 1 ); 
    x >>= 1; 
    n -= 1;
  } 
  result
} 

/// Copy uncompressed block to output.
fn copy_block( input: &mut InputBitStream, output: &mut Vec<u8> )
{
  input.clear_bits(); // Discard any bits in the input buffer
  let mut n = input.get_bits( 16 );
  let _n1 = input.get_bits( 16 );
  while n > 0 { output.push( input.data[ input.pos ] ); n -= 1; input.pos += 1; }
}

/// Decode block encoded with fixed (pre-defined) Huffman codes.
fn fixed_block( input: &mut InputBitStream, output: &mut Vec<u8> ) // RFC1951 page 12.
{
  loop
  {
    // 0 to 23 ( 7 bits ) => 256 - 279; 48 - 191 ( 8 bits ) => 0 - 143; 
    // 192 - 199 ( 8 bits ) => 280 - 287; 400..511 ( 9 bits ) => 144 - 255
    let mut x = input.get_huff( 7 ); // Could be optimised. 
    if x <= 23 
    { 
      x += 256; 
    } else {
      x = ( x << 1 ) + input.get_bit();
      if x <= 191 { x -= 48; }
      else if x <= 199 { x += 88; }
      else { x = ( x << 1 ) + input.get_bit() - 256; }
    }

    match x
    {
      0..=255 => { output.push( x as u8 ); }
      256 => { break; } 
      _ => // 257 <= x && x <= 285 
      { 
        x -= 257;
        let length = MATCH_OFF[x] as usize + input.get_bits( MATCH_EXTRA[ x ] as usize );
        let dcode = input.get_huff( 5 );
        let distance = DIST_OFF[dcode] as usize + input.get_bits( DIST_EXTRA[dcode] as usize );
        copy( output, distance, length );
      }
    }
  }
} // end fixed_block

// RFC 1951 constants.

pub static CLEN_ALPHABET : [u8; 19] = [ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 ];

pub static MATCH_OFF : [u16; 30] = [ 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 
  67,83,99,115,  131,163,195,227, 258, 0xffff ];

pub static MATCH_EXTRA : [u8; 29] = [ 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 ];

pub static DIST_OFF : [u16; 30] = [ 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 
  1025,1537,2049,3073, 4097,6145,8193,12289, 16385,24577 ];

pub static DIST_EXTRA : [u8; 30] = [ 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 ];

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP