#ifndef SCANNER_H
#define SCANNER_H
#include <assert.h>
#include <string.h>
#include <stdio.h>
/**
 * Compute which positions in a block are escaped by a preceding backslash.
 *
 * @param backslash   bitmask of backslash positions in the block
 * @param is_escaped  in: carry from the previous block, OR-ed into the
 *                    lowest position of the result and used to cancel a
 *                    backslash at that position; out: the value returned by
 *                    add_overflow() for this block
 * @return bitmask of escaped positions
 *
 * NOTE(review): add_overflow() is defined outside this block; it is used
 * here as "store a + b in the third argument and return the carry" --
 * confirm against its definition.
 */
static really_inline uint64_t find_escaped(
  uint64_t backslash, uint64_t *is_escaped)
{
  const uint64_t even_bits = 0x5555555555555555ULL;
  const uint64_t carry_in = *is_escaped;

  // a backslash that is itself escaped by the carry does not start an escape
  const uint64_t escapes = backslash & ~carry_in;
  // positions directly after an escaping backslash (or after the carry)
  const uint64_t after_escape = (escapes << 1) | carry_in;

  // runs of backslashes that begin on an odd position; adding them to the
  // backslash mask ripples a carry through each such run, leaving only the
  // runs that begin on an even position intact
  const uint64_t odd_run_starts = escapes & ~even_bits & ~after_escape;
  uint64_t even_start_runs;
  *is_escaped = add_overflow(odd_run_starts, escapes, &even_start_runs);

  // every other position after a backslash is escaped; the parity is
  // flipped for the runs selected above
  const uint64_t flip = even_start_runs << 1;
  return (even_bits ^ flip) & after_escape;
}
/**
 * Select the quotes, semicolons and newlines that really open or close a
 * quoted or comment section within one block.
 *
 * A semicolon opens a section that is closed by the next newline; a quote
 * opens a section that is closed by the next quote. Candidates that fall
 * inside an already open section are skipped.
 *
 * @param quotes      candidate quote positions
 * @param semicolons  candidate semicolon positions (disjoint from quotes)
 * @param newlines    newline positions
 * @param in_quoted   carry-over mask: non-zero if the block starts inside a
 *                    quoted section (scan() passes all-zeroes or all-ones)
 * @param in_comment  carry-over mask: non-zero if the block starts inside a
 *                    comment (scan() passes all-zeroes or all-ones)
 * @param quoted_     out: selected delimiters that are quotes
 * @param comment     out: selected delimiters that are not quotes
 */
static really_inline void find_delimiters(
  uint64_t quotes,
  uint64_t semicolons,
  uint64_t newlines,
  uint64_t in_quoted,
  uint64_t in_comment,
  uint64_t *quoted_,
  uint64_t *comment)
{
  assert(!(quotes & semicolons));

  // lowest terminator of a section carried over from the previous block
  uint64_t close = (newlines & in_comment) | (quotes & in_quoted);
  close &= -close;

  uint64_t selected = close;

  // (-close - close) covers every position strictly above `close` and is
  // zero when nothing closes; XOR with the carry-over mask keeps only
  // openers located after the carried-over section has ended
  uint64_t openers = quotes | semicolons;
  openers &= ~((in_comment | in_quoted) ^ (-close - close));

  for (; openers; openers &= -close - close) {
    const uint64_t open = openers & -openers;
    assert(open);
    const uint64_t open_quote = quotes & open;
    const uint64_t open_semicolon = semicolons & open;

    // a comment ends at the first newline at or above the semicolon, a
    // quoted section at the first quote strictly above the opening quote
    close = (newlines & -open_semicolon) |
            (quotes & (-open_quote - open_quote));
    close &= -close;

    selected |= close | open;
  }

  *quoted_ = selected & quotes;
  *comment = selected & ~quotes;
}
/**
 * Shift a match mask up by one position, carrying across blocks.
 *
 * @param match     bitmask for the current block
 * @param overflow  in: value OR-ed into the result (the bit shifted out of
 *                  the previous block); out: the top bit of `match`
 * @return `match` shifted left by one with the incoming carry OR-ed in
 */
static inline uint64_t follows(const uint64_t match, uint64_t *overflow)
{
  const uint64_t carry_in = *overflow;

  *overflow = match >> 63;
  return (match << 1) | carry_in;
}
// Table handed to simd_find_any_8x64() in scan() to locate blank characters.
// Every non-zero entry sits in the slot equal to its own low nibble
// (0x20 -> slot 0x0, 0x09 -> slot 0x9, 0x0d -> slot 0xd).
// NOTE(review): this layout suggests a lookup by low nibble followed by a
// byte compare -- confirm against SIMD_TABLE / simd_find_any_8x64.
static const simd_table_t blank = SIMD_TABLE(
0x20, // slot 0x0 : 0x20 -- space
0x00, // slot 0x1
0x00, // slot 0x2
0x00, // slot 0x3
0x00, // slot 0x4
0x00, // slot 0x5
0x00, // slot 0x6
0x00, // slot 0x7
0x00, // slot 0x8
0x09, // slot 0x9 : 0x09 -- horizontal tab
0x00, // slot 0xa
0x00, // slot 0xb
0x00, // slot 0xc
0x0d, // slot 0xd : 0x0d -- carriage return
0x00, // slot 0xe
0x00  // slot 0xf
);
// Table handed to simd_find_any_8x64() in scan() to locate special
// characters. Every non-zero entry sits in the slot equal to its own low
// nibble (0x28 -> slot 0x8, 0x29 -> slot 0x9, 0x0a -> slot 0xa).
// NOTE(review): this layout suggests a lookup by low nibble followed by a
// byte compare; if so, the 0x00 in slot 0x0 makes a NUL byte match as well
// -- confirm against SIMD_TABLE / simd_find_any_8x64.
static const simd_table_t special = SIMD_TABLE(
0x00, // slot 0x0 : 0x00 -- NUL (see note above)
0x00, // slot 0x1
0x00, // slot 0x2
0x00, // slot 0x3
0x00, // slot 0x4
0x00, // slot 0x5
0x00, // slot 0x6
0x00, // slot 0x7
0x28, // slot 0x8 : 0x28 -- '('
0x29, // slot 0x9 : 0x29 -- ')'
0x0a, // slot 0xa : 0x0a -- line feed
0x00, // slot 0xb
0x00, // slot 0xc
0x00, // slot 0xd
0x00, // slot 0xe
0x00  // slot 0xf
);
// Per-block scratch state filled in by scan() and consumed by
// write_indexes(). Every uint64_t member is a bitmask over the bytes of
// `input`; write_indexes() turns bit i into the address `base + i`.
typedef struct block block_t;
struct block {
// raw input bytes for this block, loaded with simd_loadu_8x64()
simd_8x64_t input;
// positions of '\n'
uint64_t newline;
// positions of '\\'
uint64_t backslash;
// positions reported by find_escaped() as escaped
uint64_t escaped;
// comment delimiters selected by find_delimiters(); zero when scan() did
// not need to call it
uint64_t comment;
// unescaped '"' positions, narrowed by find_delimiters() (when called) to
// the quotes that actually delimit a quoted section
uint64_t quoted;
// unescaped ';' positions
uint64_t semicolon;
// state carried in from the previous block XOR prefix_xor(quoted)
uint64_t in_quoted;
// state carried in from the previous block XOR prefix_xor(comment)
uint64_t in_comment;
// positions that are not blank, special or a quote delimiter and lie
// outside quoted and comment sections
uint64_t contiguous;
// `contiguous` shifted up by one position with the carry from the
// previous block, see follows()
uint64_t follows_contiguous;
// matches of the `blank` table that are not escaped, quoted or commented
uint64_t blank;
// matches of the `special` table that are not escaped, quoted or commented
uint64_t special;
};
/**
 * Classify every byte of block->input and fill in the bitmasks of `block`.
 *
 * Reads and updates the cross-block carry state in parser->file->state
 * (is_escaped, in_quoted, in_comment, follows_contiguous).
 *
 * @param parser  parser whose current file holds the carry state
 * @param block   block with `input` already loaded; all masks are written
 */
static really_inline void scan(parser_t *parser, block_t *block)
{
  const uint64_t newline = simd_find_8x64(&block->input, '\n');
  const uint64_t backslash = simd_find_8x64(&block->input, '\\');
  const uint64_t escaped =
    find_escaped(backslash, &parser->file->state.is_escaped);

  // escaped quotes and semicolons never act as delimiters
  uint64_t quoted = simd_find_8x64(&block->input, '"') & ~escaped;
  const uint64_t semicolon = simd_find_8x64(&block->input, ';') & ~escaped;
  uint64_t comment = 0;

  uint64_t in_quoted = parser->file->state.in_quoted;
  uint64_t in_comment = parser->file->state.in_comment;

  // quotes and semicolons only need to be weighed against each other if a
  // comment is open or may be opened in this block
  const int has_comment = (in_comment || semicolon);
  if (has_comment)
    find_delimiters(
      quoted, semicolon, newline, in_quoted, in_comment, &quoted, &comment);

  // the carry-over state is bit 63 of the section mask broadcast to all bits
  in_quoted ^= prefix_xor(quoted);
  parser->file->state.in_quoted = (uint64_t)((int64_t)in_quoted >> 63);
  if (has_comment) {
    in_comment ^= prefix_xor(comment);
    parser->file->state.in_comment = (uint64_t)((int64_t)in_comment >> 63);
  }

  const uint64_t sectioned = in_quoted | in_comment;
  const uint64_t blank_bits =
    simd_find_any_8x64(&block->input, blank) & ~(escaped | sectioned);
  const uint64_t special_bits =
    simd_find_any_8x64(&block->input, special) & ~(escaped | sectioned);
  const uint64_t contiguous =
    ~(blank_bits | special_bits | quoted) & ~sectioned;

  block->newline = newline;
  block->backslash = backslash;
  block->escaped = escaped;
  block->comment = comment;
  block->quoted = quoted;
  block->semicolon = semicolon;
  block->in_quoted = in_quoted;
  block->in_comment = in_comment;
  block->blank = blank_bits;
  block->special = special_bits;
  block->contiguous = contiguous;
  block->follows_contiguous =
    follows(contiguous, &parser->file->state.follows_contiguous);
}
/**
 * Append the token positions found in `block` to the field and delimiter
 * tapes of the current file.
 *
 * @param parser  parser whose file owns the tapes and the input buffer
 * @param block   block previously filled in by scan()
 * @param clear   positions to ignore (reindex() passes the bits beyond the
 *                end of a partial block, or 0 for a full block)
 */
static really_inline void write_indexes(parser_t *parser, const block_t *block, uint64_t clear)
{
// a field starts at the first byte of every contiguous run (bit set while
// the bit below it is clear), at every quote delimiter whose in_quoted bit
// is set, and at every special character
uint64_t fields = (block->contiguous & ~block->follows_contiguous) |
(block->quoted & block->in_quoted) |
(block->special);
// a delimiter is the first non-contiguous byte after a contiguous run, or a
// quote delimiter whose in_quoted bit is clear
uint64_t delimiters = (~block->contiguous & block->follows_contiguous) |
(block->quoted & ~block->in_quoted);
fields &= ~clear;
delimiters &= ~clear;
// bit i of a mask maps to address base + i
const char *base = parser->file->buffer.data + parser->file->buffer.index;
uint64_t field_count = count_ones(fields);
uint64_t delimiter_count = count_ones(delimiters);
// both tapes are written in one loop, so iterate up to the larger count
uint64_t count = field_count;
if (delimiter_count > field_count)
count = delimiter_count;
// newlines located inside contiguous or quoted data
uint64_t newlines = block->newline & (block->contiguous | block->in_quoted);
// slow path: taken if such newlines exist in this block or if the current
// newline counter is already non-zero
if (unlikely(*parser->file->newlines.tail || newlines)) {
for (uint64_t i=0; i < count; i++) {
// isolate the lowest remaining field and delimiter bit
const uint64_t field = fields & -fields;
const uint64_t delimiter = delimiters & -delimiters;
// the field itself is a newline character
if (field & block->newline) {
// add the embedded newlines located below this field to the counter
*parser->file->newlines.tail += count_ones(newlines & (field - 1));
if (*parser->file->newlines.tail) {
// non-zero counter: tape `line_feed` instead of the position in the
// buffer and move on to the next counter
parser->file->fields.tail[i] = line_feed;
parser->file->newlines.tail++;
} else {
parser->file->fields.tail[i] = base + trailing_zeroes(field);
}
// drop the embedded newlines that have just been counted
newlines &= -field;
} else {
parser->file->fields.tail[i] = base + trailing_zeroes(field);
}
parser->file->delimiters.tail[i] = base + trailing_zeroes(delimiter);
fields &= ~field;
delimiters &= ~delimiter;
}
// embedded newlines after the last newline field stay pending in the
// current counter
*parser->file->newlines.tail += count_ones(newlines);
parser->file->fields.tail += field_count;
parser->file->delimiters.tail += delimiter_count;
} else {
// fast path: entries are written in fixed groups of six regardless of
// `count`; slots beyond the real counts lie past the tails advanced below
for (uint64_t i=0; i < 6; i++) {
parser->file->fields.tail[i] = base + trailing_zeroes(fields);
parser->file->delimiters.tail[i] = base + trailing_zeroes(delimiters);
fields = clear_lowest_bit(fields);
delimiters = clear_lowest_bit(delimiters);
}
if (unlikely(count > 6)) {
for (uint64_t i=6; i < 12; i++) {
parser->file->fields.tail[i] = base + trailing_zeroes(fields);
parser->file->delimiters.tail[i] = base + trailing_zeroes(delimiters);
fields = clear_lowest_bit(fields);
delimiters = clear_lowest_bit(delimiters);
}
if (unlikely(count > 12)) {
for (uint64_t i=12; i < count; i++) {
parser->file->fields.tail[i] = base + trailing_zeroes(fields);
parser->file->delimiters.tail[i] = base + trailing_zeroes(delimiters);
fields = clear_lowest_bit(fields);
delimiters = clear_lowest_bit(delimiters);
}
}
}
// only the real number of entries is committed
parser->file->fields.tail += field_count;
parser->file->delimiters.tail += delimiter_count;
}
}
/**
 * Scan buffered input block by block and append the token positions to the
 * tapes of the current file.
 *
 * Full blocks of ZONE_BLOCK_SIZE bytes are scanned while input is available
 * and the field tape has room for a worst-case block. A trailing partial
 * block is scanned only once parser->file->end_of_file is set; it is copied
 * into a zeroed scratch buffer first and positions beyond the real data are
 * cleared. When the last data has been indexed, end_of_file is set to
 * NO_MORE_DATA.
 *
 * @param parser  parser whose current file is to be indexed
 * @return non-zero if the last position of the last scanned block is part
 *         of contiguous or quoted data, zero otherwise (also zero if no
 *         block was scanned)
 */
nonnull_all
warn_unused_result
static really_inline int32_t reindex(parser_t *parser)
{
  block_t block = { 0 };

  assert(parser->file->buffer.index <= parser->file->buffer.length);
  size_t left = parser->file->buffer.length - parser->file->buffer.index;
  const char *data = parser->file->buffer.data + parser->file->buffer.index;
  const char **tape = parser->file->fields.tail;
  const char **tape_limit = parser->file->fields.tape + ZONE_TAPE_SIZE;

  if (left >= ZONE_BLOCK_SIZE) {
    const char *data_limit = parser->file->buffer.data +
                            (parser->file->buffer.length - ZONE_BLOCK_SIZE);
    // a block can yield one field per input byte, so room on the tape must
    // be measured in tape entries (pointer subtraction), not in bytes
    while (data <= data_limit &&
           (size_t)(tape_limit - tape) >= ZONE_BLOCK_SIZE) {
      simd_loadu_8x64(&block.input, (const uint8_t *)data);
      scan(parser, &block);
      write_indexes(parser, &block, 0);
      parser->file->buffer.index += ZONE_BLOCK_SIZE;
      data += ZONE_BLOCK_SIZE;
      tape = parser->file->fields.tail;
    }

    assert(parser->file->buffer.index <= parser->file->buffer.length);
    left = parser->file->buffer.length - parser->file->buffer.index;
  }

  // a partial block is only scanned once all data has been read
  if (parser->file->end_of_file) {
    assert(left < ZONE_BLOCK_SIZE);
    if (!left) {
      parser->file->end_of_file = NO_MORE_DATA;
    } else if ((size_t)(tape_limit - tape) >= left) {
      // copy to a zeroed scratch block so bytes past the end of the data
      // are well-defined
      uint8_t buffer[ZONE_BLOCK_SIZE] = { 0 };
      memcpy(buffer, data, left);
      // mask of the positions at and beyond the end of the data
      const uint64_t clear = ~((1llu << left) - 1);
      simd_loadu_8x64(&block.input, buffer);
      scan(parser, &block);
      block.contiguous &= ~clear;
      write_indexes(parser, &block, clear);
      parser->file->end_of_file = NO_MORE_DATA;
      parser->file->buffer.index += left;
    }
  }

  // top bit set: the block ended inside contiguous or quoted data
  return ((block.contiguous | block.in_quoted) >> 63) != 0;
}
#endif