use crate::iter::Bytes;
use core::arch::aarch64::*;
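// Each matcher below advances through the input 16 bytes at a time. If a full
// 16-byte block matches, the loop continues; otherwise the first rejected byte
// has been reached and we return. Any remaining tail shorter than 16 bytes is
// handled by the scalar SWAR fallback.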
#[inline]
pub fn match_header_name_vectored(bytes: &mut Bytes) {
    while bytes.as_ref().len() >= 16 {
        unsafe {
            let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr());
            bytes.advance(advance);

            if advance != 16 {
                return;
            }
        }
    }
    super::swar::match_header_name_vectored(bytes);
}

#[inline]
pub fn match_header_value_vectored(bytes: &mut Bytes) {
    while bytes.as_ref().len() >= 16 {
        unsafe {
            let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr());
            bytes.advance(advance);

            if advance != 16 {
                return;
            }
        }
    }
    super::swar::match_header_value_vectored(bytes);
}

#[inline]
pub fn match_uri_vectored(bytes: &mut Bytes) {
    while bytes.as_ref().len() >= 16 {
        unsafe {
            let advance = match_url_char_16_neon(bytes.as_ref().as_ptr());
            bytes.advance(advance);

            if advance != 16 {
                return;
            }
        }
    }
    super::swar::match_uri_vectored(bytes);
}

const fn bit_set(x: u8) -> bool {
    // Validates if a byte is a valid header name character
    // https://tools.ietf.org/html/rfc7230#section-3.2.6
    matches!(x, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~')
}

// A 256-bit bitmap, split into two 128-bit halves:
// the lower half covers bytes whose high nibble is <= 7,
// the upper half covers bytes whose high nibble is >= 8.
const fn build_bitmap() -> ([u8; 16], [u8; 16]) {
    let mut bitmap_0_7 = [0u8; 16]; // 0x00..0x7F
    let mut bitmap_8_15 = [0u8; 16]; // 0x80..0xFF
    let mut i = 0;
    while i < 256 {
        if bit_set(i as u8) {
            // Nibbles
            let (lo, hi) = (i & 0x0F, i >> 4);
            if i < 128 {
                bitmap_0_7[lo] |= 1 << hi;
            } else {
                bitmap_8_15[lo] |= 1 << hi;
            }
        }
        i += 1;
    }
    (bitmap_0_7, bitmap_8_15)
}

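// For example, b'A' == 0x41 has low nibble 1 and high nibble 4, so
// build_bitmap sets bit 4 of bitmap_0_7[1]; the table lookup in the matcher
// below recovers exactly that bit.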
const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap();

// NOTE: adapted from 256-bit version, with upper 128-bit ops commented out
#[inline]
unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
    let bitmaps = BITMAPS;
    // NOTE: ideally compile-time constants
    let (bitmap_0_7, _bitmap_8_15) = bitmaps;
    let bitmap_0_7 = vld1q_u8(bitmap_0_7.as_ptr());
    // let bitmap_8_15 = vld1q_u8(bitmap_8_15.as_ptr());

    // Initialize the bitmask_lookup.
    const BITMASK_LOOKUP_DATA: [u8; 16] =
        [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
    let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr());

    // Load 16 input bytes.
    let input = vld1q_u8(ptr);

    // Extract indices for row_0_7.
    let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111;
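    // Bytes >= 0x80 keep their MSB after the mask, so their index is >= 16;
    // TBL returns 0 for out-of-range indices, hence such bytes never match.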

    // Extract indices for row_8_15.
    // let msb = vandq_u8(input, vdupq_n_u8(0x80));
    // let indices_8_15 = veorq_u8(indices_0_7, msb);

    // Fetch row_0_7 and row_8_15.
    let row_0_7 = vqtbl1q_u8(bitmap_0_7, indices_0_7);
    // let row_8_15 = vqtbl1q_u8(bitmap_8_15, indices_8_15);

    // Calculate a bitmask, i.e. (1 << hi_nibble % 8).
    let bitmask = vqtbl1q_u8(bitmask_lookup, vshrq_n_u8(input, 4));

    // Choose the row halves depending on the high nibbles.
    // let bitsets = vorrq_u8(row_0_7, row_8_15);
    let bitsets = row_0_7;

    // Finally check which bytes belong to the set.
    let tmp = vandq_u8(bitsets, bitmask);
    let result = vceqq_u8(tmp, bitmask);

    offsetz(result) as usize
}

#[inline]
unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
    let input = vld1q_u8(ptr);

    // Check that b'!' <= input <= b'~'
    let result = vandq_u8(
        vcleq_u8(vdupq_n_u8(b'!'), input),
        vcleq_u8(input, vdupq_n_u8(b'~')),
    );
    // Check that input != b'<' and input != b'>'
    let lt = vceqq_u8(input, vdupq_n_u8(b'<'));
    let gt = vceqq_u8(input, vdupq_n_u8(b'>'));
    let ltgt = vorrq_u8(lt, gt);
    // Clear the matches for b'<' and b'>' (AND-NOT)
    let result = vbicq_u8(result, ltgt);

    offsetz(result) as usize
}

#[inline]
unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize {
    let input = vld1q_u8(ptr);

    // Accept bytes >= b' '; TAB (0x09) is re-allowed and DEL (0x7F) rejected below.
    let result = vcleq_u8(vdupq_n_u8(b' '), input);

    // Allow tab
    let tab = vceqq_u8(input, vdupq_n_u8(0x09));
    let result = vorrq_u8(result, tab);

    // Disallow del
    let del = vceqq_u8(input, vdupq_n_u8(0x7F));
    let result = vbicq_u8(result, del);

    offsetz(result) as usize
}

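// Returns the number of leading bytes in the mask whose lanes are all-ones,
// i.e. the index of the first non-matching byte (16 if all 16 bytes matched).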
#[inline]
unsafe fn offsetz(x: uint8x16_t) -> u32 {
    // NOT the vector since it's faster to operate with zeros instead
    offsetnz(vmvnq_u8(x))
}

#[inline]
unsafe fn offsetnz(x: uint8x16_t) -> u32 {
    // Extract two u64
    let x = vreinterpretq_u64_u8(x);
    let low: u64 = core::mem::transmute(vget_low_u64(x));
    let high: u64 = core::mem::transmute(vget_high_u64(x));

    #[inline]
    fn clz(x: u64) -> u32 {
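        // Note: despite the name, this returns the index of the first non-zero
        // byte in memory order rather than a leading-zero count.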
        // perf: rust will unroll this loop
        // and it's much faster than rbit + clz so voila
        for (i, b) in x.to_ne_bytes().iter().copied().enumerate() {
            if b != 0 {
                return i as u32;
            }
        }
        8 // Technically not reachable since zero-guarded
    }

    if low != 0 {
        clz(low)
    } else if high != 0 {
        8 + clz(high)
    } else {
        16
    }
}

#[test]
fn neon_code_matches_uri_chars_table() {
    unsafe {
        assert!(byte_is_allowed(b'_', match_uri_vectored));

        for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
            assert_eq!(
                byte_is_allowed(b as u8, match_uri_vectored),
                allowed,
                "byte_is_allowed({:?}) should be {:?}",
                b,
                allowed,
            );
        }
    }
}

#[test]
fn neon_code_matches_header_value_chars_table() {
    unsafe {
        assert!(byte_is_allowed(b'_', match_header_value_vectored));

        for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() {
            assert_eq!(
                byte_is_allowed(b as u8, match_header_value_vectored),
                allowed,
                "byte_is_allowed({:?}) should be {:?}",
                b,
                allowed,
            );
        }
    }
}

#[test]
fn neon_code_matches_header_name_chars_table() {
    unsafe {
        assert!(byte_is_allowed(b'_', match_header_name_vectored));

        for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
            assert_eq!(
                byte_is_allowed(b as u8, match_header_name_vectored),
                allowed,
                "byte_is_allowed({:?}) should be {:?}",
                b,
                allowed,
            );
        }
    }
}
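
// A small additional sanity check (an illustrative sketch, not part of the
// matcher itself; the test name is ours): the packed BITMAPS table should
// agree with `bit_set` for every byte value, using the same
// `1 << (high_nibble % 8)` bit convention as BITMASK_LOOKUP_DATA.
#[test]
fn bitmaps_agree_with_bit_set() {
    let (bitmap_0_7, bitmap_8_15) = BITMAPS;
    for b in 0..=255u8 {
        let (lo, hi) = ((b & 0x0F) as usize, b >> 4);
        let half = if b < 128 { &bitmap_0_7 } else { &bitmap_8_15 };
        let in_bitmap = (half[lo] & (1u8 << (hi % 8))) != 0;
        assert_eq!(in_bitmap, bit_set(b), "mismatch for byte {:#04x}", b);
    }
}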
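// Test helper: plants `byte` at index 10 of an otherwise-allowed buffer and
// reports whether the matcher consumed all 16 bytes (allowed) or stopped at
// the planted byte (disallowed).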
#[cfg(test)]
unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
    let mut slice = [b'_'; 16];
    slice[10] = byte;
    let mut bytes = Bytes::new(&slice);

    f(&mut bytes);

    match bytes.pos() {
        16 => true,
        10 => false,
        x => panic!("unexpected pos: {}", x),
    }
}