/*

Copyright (c) 2012, Lambda Foundry, Inc., except where noted

Incorporates components of WarrenWeckesser/textreader, licensed under
3-clause BSD

See LICENSE for the license

*/

/*

Low-level ascii-file processing for pandas. Combines some elements from
Python's built-in csv module and Warren Weckesser's textreader project on
GitHub. See Python Software Foundation License and BSD licenses for these.

*/

#include "tokenizer.h"

#include <ctype.h>
#include <float.h>
#include <math.h>

#include "../headers/portable.h"

void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                   int64_t start) {
    // column i, starting at 0
    self->words = parser->words;
    self->col = i;
    self->line_start = parser->line_start + start;
}

static void free_if_not_null(void **ptr) {
    TRACE(("free_if_not_null %p\n", *ptr))
    if (*ptr != NULL) {
        free(*ptr);
        *ptr = NULL;
    }
}

/*

  Parser / tokenizer

*/

static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity,
                         int64_t space, int64_t elsize, int *error) {
    uint64_t cap = *capacity;
    void *newbuffer = buffer;

    // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
    while ((length + space >= cap) && (newbuffer != NULL)) {
        cap = cap ? cap << 1 : 2;
        buffer = newbuffer;
        newbuffer = realloc(newbuffer, elsize * cap);
    }

    if (newbuffer == NULL) {
        // realloc failed so don't change *capacity, set *error to errno
        // and return the last good realloc'd buffer so it can be freed
        *error = errno;
        newbuffer = buffer;
    } else {
        // realloc worked, update *capacity and set *error to 0
        // sigh, multiple return values
        *capacity = cap;
        *error = 0;
    }
    return newbuffer;
}

void parser_set_default_options(parser_t *self) {
    self->decimal = '.';
    self->sci = 'E';

    // For tokenization
    self->state = START_RECORD;

    self->delimiter = ',';  // XXX
    self->delim_whitespace = 0;

    self->doublequote = 0;
    self->quotechar = '"';
    self->escapechar = 0;
    self->lineterminator = '\0'; /* NUL->standard logic */
    self->skipinitialspace = 0;
    self->quoting = QUOTE_MINIMAL;
    self->allow_embedded_newline = 1;

    self->expected_fields = -1;
    self->on_bad_lines = ERROR;

    self->commentchar = '#';
    self->thousands = '\0';

    self->skipset = NULL;
    self->skipfunc = NULL;
    self->skip_first_N_rows = -1;
    self->skip_footer = 0;
}

parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); }

int parser_clear_data_buffers(parser_t *self) {
    free_if_not_null((void *)&self->stream);
    free_if_not_null((void *)&self->words);
    free_if_not_null((void *)&self->word_starts);
    free_if_not_null((void *)&self->line_start);
    free_if_not_null((void *)&self->line_fields);
    return 0;
}

int parser_cleanup(parser_t *self) {
    int status = 0;

    // XXX where to put this
    free_if_not_null((void *)&self->error_msg);
    free_if_not_null((void *)&self->warn_msg);

    if (self->skipset != NULL) {
        kh_destroy_int64((kh_int64_t *)self->skipset);
        self->skipset = NULL;
    }

    if (parser_clear_data_buffers(self) < 0) {
        status = -1;
    }

    if (self->cb_cleanup != NULL) {
        if (self->cb_cleanup(self->source) < 0) {
            status = -1;
        }
        self->cb_cleanup = NULL;
    }

    return status;
}

int parser_init(parser_t *self) {
    int64_t sz;

    /*
      Initialize data buffers
    */

    self->stream = NULL;
    self->words = NULL;
    self->word_starts = NULL;
    self->line_start = NULL;
    self->line_fields = NULL;
    self->error_msg = NULL;
    self->warn_msg = NULL;

    // token stream
    self->stream = malloc(STREAM_INIT_SIZE * sizeof(char));
    if (self->stream == NULL) {
        parser_cleanup(self);
        return PARSER_OUT_OF_MEMORY;
    }
    self->stream_cap = STREAM_INIT_SIZE;
    self->stream_len = 0;

    // word pointers and metadata
    sz = STREAM_INIT_SIZE / 10;
    sz = sz ?
sz : 1; self->words = malloc(sz * sizeof(char *)); self->word_starts = malloc(sz * sizeof(int64_t)); self->max_words_cap = sz; self->words_cap = sz; self->words_len = 0; // line pointers and metadata self->line_start = malloc(sz * sizeof(int64_t)); self->line_fields = malloc(sz * sizeof(int64_t)); self->lines_cap = sz; self->lines = 0; self->file_lines = 0; if (self->stream == NULL || self->words == NULL || self->word_starts == NULL || self->line_start == NULL || self->line_fields == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; } /* amount of bytes buffered */ self->datalen = 0; self->datapos = 0; self->line_start[0] = 0; self->line_fields[0] = 0; self->pword_start = self->stream; self->word_start = 0; self->state = START_RECORD; self->error_msg = NULL; self->warn_msg = NULL; self->commentchar = '\0'; return 0; } void parser_free(parser_t *self) { // opposite of parser_init parser_cleanup(self); } void parser_del(parser_t *self) { free(self); } static int make_stream_space(parser_t *self, size_t nbytes) { uint64_t i, cap, length; int status; void *orig_ptr, *newptr; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? /* TOKEN STREAM */ orig_ptr = (void *)self->stream; TRACE( ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, &self->stream_cap, nbytes * 2, sizeof(char), &status); TRACE( ("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } // realloc sets errno when moving buffer? if (self->stream != orig_ptr) { self->pword_start = self->stream + self->word_start; for (i = 0; i < self->words_len; ++i) { self->words[i] = self->stream + self->word_starts[i]; } } /* WORD VECTORS */ cap = self->words_cap; /** * If we are reading in chunks, we need to be aware of the maximum number * of words we have seen in previous chunks (self->max_words_cap), so * that way, we can properly allocate when reading subsequent ones. * * Otherwise, we risk a buffer overflow if we mistakenly under-allocate * just because a recent chunk did not have as many words. 
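     *
     * Illustrative numbers only: if an earlier chunk produced 1,000 words
     * (so max_words_cap == 1000) while the current chunk has only 10 so
     * far, the allocation below is still sized against the 1,000-word high
     * water mark rather than against words_len == 10.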
*/ if (self->words_len + nbytes < self->max_words_cap) { length = self->max_words_cap - nbytes - 1; } else { length = self->words_len; } self->words = (char **)grow_buffer((void *)self->words, length, &self->words_cap, nbytes, sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } // realloc took place if (cap != self->words_cap) { TRACE( ("make_stream_space: cap != self->words_cap, nbytes = %d, " "self->words_cap=%d\n", nbytes, self->words_cap)) newptr = realloc((void *)self->word_starts, sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->word_starts = (int64_t *)newptr; } } /* LINE VECTORS */ cap = self->lines_cap; self->line_start = (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, &self->lines_cap, nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } // realloc took place if (cap != self->lines_cap) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) newptr = realloc((void *)self->line_fields, sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_fields = (int64_t *)newptr; } } return 0; } static int push_char(parser_t *self, char c) { TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len + 1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { TRACE( ("push_char: ERROR!!! self->stream_len(%d) >= " "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) int64_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } self->stream[self->stream_len++] = c; return 0; } int PANDAS_INLINE end_field(parser_t *self) { // XXX cruft if (self->words_len >= self->words_cap) { TRACE( ("end_field: ERROR!!! self->words_len(%zu) >= " "self->words_cap(%zu)\n", self->words_len, self->words_cap)) int64_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } // null terminate token push_char(self, '\0'); // set pointer and metadata self->words[self->words_len] = self->pword_start; TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); TRACE(("end_field: Saw word %s at: %d. 
Total: %d\n", self->pword_start, self->word_start, self->words_len + 1)) self->word_starts[self->words_len] = self->word_start; self->words_len++; // increment line field count self->line_fields[self->lines]++; // New field begin in stream self->pword_start = self->stream + self->stream_len; self->word_start = self->stream_len; return 0; } static void append_warning(parser_t *self, const char *msg) { int64_t ex_length; int64_t length = strlen(msg); void *newptr; if (self->warn_msg == NULL) { self->warn_msg = malloc(length + 1); snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); newptr = realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { self->warn_msg = (char *)newptr; snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); } } } static int end_line(parser_t *self) { char *msg; int64_t fields; int64_t ex_fields = self->expected_fields; int64_t bufsize = 100; // for error or warning messages fields = self->line_fields[self->lines]; TRACE(("end_line: Line end, nfields: %d\n", fields)); TRACE(("end_line: lines: %d\n", self->lines)); if (self->lines > 0) { if (self->expected_fields >= 0) { ex_fields = self->expected_fields; } else { ex_fields = self->line_fields[self->lines - 1]; } } TRACE(("end_line: ex_fields: %d\n", ex_fields)); if (self->state == START_FIELD_IN_SKIP_LINE || self->state == IN_FIELD_IN_SKIP_LINE || self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; // skip the tokens from this bad line self->line_start[self->lines] += fields; // reset field count self->line_fields[self->lines] = 0; return 0; } if (!(self->lines <= self->header_end + 1) && (fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; // skip the tokens from this bad line self->line_start[self->lines] += fields; // reset field count self->line_fields[self->lines] = 0; // file_lines is now the actual file line number (starting at 1) if (self->on_bad_lines == ERROR) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" PRId64 "\n", ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); return -1; } else { // simply skip bad lines if (self->on_bad_lines == WARN) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, "Skipping line %" PRIu64 ": expected %" PRId64 " fields, saw %" PRId64 "\n", self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } } } else { // missing trailing delimiters if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } while (fields < ex_fields) { end_field(self); fields++; } } // increment both line counts self->file_lines++; self->lines++; // good line, set new start point if (self->lines >= self->lines_cap) { TRACE(( "end_line: ERROR!!! 
self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) int64_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " "possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; } self->line_start[self->lines] = (self->line_start[self->lines - 1] + fields); TRACE( ("end_line: new line start: %d\n", self->line_start[self->lines])); // new line start with 0 fields self->line_fields[self->lines] = 0; } TRACE(("end_line: Finished line, at %d\n", self->lines)); return 0; } int parser_add_skiprow(parser_t *self, int64_t row) { khiter_t k; kh_int64_t *set; int ret = 0; if (self->skipset == NULL) { self->skipset = (void *)kh_init_int64(); } set = (kh_int64_t *)self->skipset; k = kh_put_int64(set, row, &ret); set->keys[k] = row; return 0; } int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { // self->file_lines is zero based so subtract 1 from nrows if (nrows > 0) { self->skip_first_N_rows = nrows - 1; } return 0; } static int parser_buffer_bytes(parser_t *self, size_t nbytes, const char *encoding_errors) { int status; size_t bytes_read; status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status, encoding_errors); TRACE(( "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", nbytes, bytes_read, status)); self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { int64_t bufsize = 200; self->error_msg = malloc(bufsize); if (status == CALLING_READ_FAILED) { snprintf(self->error_msg, bufsize, "Calling read(nbytes) on source failed. " "Try engine='python'."); } else { snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); } return -1; } TRACE(("datalen: %d\n", self->datalen)); return status; } /* Tokenization macros and state machine code */ #define PUSH_CHAR(c) \ TRACE( \ ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ c, slen, self->stream_cap, self->stream_len)) \ if (slen >= self->stream_cap) { \ TRACE(("PUSH_CHAR: ERROR!!! 
slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ int64_t bufsize = 100; \ self->error_msg = malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n");\ return PARSER_OUT_OF_MEMORY; \ } \ *stream++ = c; \ slen++; // This is a little bit of a hack but works for now #define END_FIELD() \ self->stream_len = slen; \ if (end_field(self) < 0) { \ goto parsingerror; \ } \ stream = self->stream + self->stream_len; \ slen = self->stream_len; #define END_LINE_STATE(STATE) \ self->stream_len = slen; \ if (end_line(self) < 0) { \ goto parsingerror; \ } \ stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } #define END_LINE_AND_FIELD_STATE(STATE) \ self->stream_len = slen; \ if (end_line(self) < 0) { \ goto parsingerror; \ } \ if (end_field(self) < 0) { \ goto parsingerror; \ } \ stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } #define END_LINE() END_LINE_STATE(START_RECORD) #define IS_TERMINATOR(c) \ (c == line_terminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) // don't parse '\r' with a custom line terminator #define IS_CARRIAGE(c) (c == carriage_symbol) #define IS_COMMENT_CHAR(c) (c == comment_symbol) #define IS_ESCAPE_CHAR(c) (c == escape_symbol) #define IS_SKIPPABLE_SPACE(c) \ ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field #define IS_DELIMITER(c) \ ((!self->delim_whitespace && c == self->delimiter) || \ (self->delim_whitespace && isblank(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ self->datapos = i; \ TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ self->datalen)); #define CHECK_FOR_BOM() \ if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ buf += 3; \ self->datapos += 3; \ } int skip_this_line(parser_t *self, int64_t rownum) { int should_skip; PyObject *result; PyGILState_STATE state; if (self->skipfunc != NULL) { state = PyGILState_Ensure(); result = PyObject_CallFunction(self->skipfunc, "i", rownum); // Error occurred. It will be processed // and caught at the Cython level. if (result == NULL) { should_skip = -1; } else { should_skip = PyObject_IsTrue(result); } Py_XDECREF(result); PyGILState_Release(state); return should_skip; } else if (self->skipset != NULL) { return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != ((kh_int64_t *)self->skipset)->n_buckets); } else { return (rownum <= self->skip_first_N_rows); } } int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { int64_t i; uint64_t slen; int should_skip; char c; char *stream; char *buf = self->data + self->datapos; const char line_terminator = (self->lineterminator == '\0') ? '\n' : self->lineterminator; // 1000 is something that couldn't fit in "char" // thus comparing a char to it would always be "false" const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; const int comment_symbol = (self->commentchar != '\0') ? self->commentchar : 1000; const int escape_symbol = (self->escapechar != '\0') ? 
self->escapechar : 1000; if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } stream = self->stream + self->stream_len; slen = self->stream_len; TRACE(("%s\n", buf)); if (self->file_lines == 0) { CHECK_FOR_BOM(); } for (i = self->datapos; i < self->datalen; ++i) { // next character in file c = *buf++; TRACE( ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " "state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); switch (self->state) { case START_FIELD_IN_SKIP_LINE: if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } else if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else if (IS_DELIMITER(c)) { // Do nothing, we're starting a new field again. } else { self->state = IN_FIELD_IN_SKIP_LINE; } break; case IN_FIELD_IN_SKIP_LINE: if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } else if (IS_DELIMITER(c)) { self->state = START_FIELD_IN_SKIP_LINE; } break; case IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { if (self->doublequote) { self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; } else { self->state = IN_FIELD_IN_SKIP_LINE; } } break; case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } else if (IS_DELIMITER(c)) { self->state = START_FIELD_IN_SKIP_LINE; } else { self->state = IN_FIELD_IN_SKIP_LINE; } break; case WHITESPACE_LINE: if (IS_TERMINATOR(c)) { self->file_lines++; self->state = START_RECORD; break; } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; break; } else if (!self->delim_whitespace) { if (isblank(c) && c != self->delimiter) { } else { // backtrack // use i + 1 because buf has been incremented but not i do { --buf; --i; } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); // reached a newline rather than the beginning if (IS_TERMINATOR(*buf)) { ++buf; // move pointer to first char after newline ++i; } self->state = START_FIELD; } break; } // fall through case EAT_WHITESPACE: if (IS_TERMINATOR(c)) { END_LINE(); self->state = START_RECORD; break; } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; break; } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_COMMENT; break; } else if (!isblank(c)) { self->state = START_FIELD; // fall through to subsequent state } else { // if whitespace char, keep slurping break; } case START_RECORD: // start of record should_skip = skip_this_line(self, self->file_lines); if (should_skip == -1) { goto parsingerror; } else if (should_skip) { if (IS_QUOTE(c)) { self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else { self->state = IN_FIELD_IN_SKIP_LINE; if (IS_TERMINATOR(c)) { END_LINE(); } } break; } else if (IS_TERMINATOR(c)) { // \n\r possible? 
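                // A bare terminator at the start of a record is either
                // counted and discarded (skip_empty_lines) or flushed as an
                // empty record via END_LINE().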
if (self->skip_empty_lines) { self->file_lines++; } else { END_LINE(); } break; } else if (IS_CARRIAGE(c)) { if (self->skip_empty_lines) { self->file_lines++; self->state = EAT_CRNL_NOP; } else { self->state = EAT_CRNL; } break; } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_LINE_COMMENT; break; } else if (isblank(c)) { if (self->delim_whitespace) { if (self->skip_empty_lines) { self->state = WHITESPACE_LINE; } else { self->state = EAT_WHITESPACE; } break; } else if (c != self->delimiter && self->skip_empty_lines) { self->state = WHITESPACE_LINE; break; } // fall through } // normal character - fall through // to handle as START_FIELD self->state = START_FIELD; case START_FIELD: // expecting field if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; } else if (IS_QUOTE(c)) { // start quoted field self->state = IN_QUOTED_FIELD; } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_SKIPPABLE_SPACE(c)) { // ignore space at start of field } else if (IS_DELIMITER(c)) { if (self->delim_whitespace) { self->state = EAT_WHITESPACE; } else { // save empty field END_FIELD(); } } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { // begin new unquoted field PUSH_CHAR(c); self->state = IN_FIELD; } break; case ESCAPED_CHAR: PUSH_CHAR(c); self->state = IN_FIELD; break; case EAT_LINE_COMMENT: if (IS_TERMINATOR(c)) { self->file_lines++; self->state = START_RECORD; } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } break; case IN_FIELD: // in unquoted field if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_DELIMITER(c)) { // end of field - end of line not reached yet END_FIELD(); if (self->delim_whitespace) { self->state = EAT_WHITESPACE; } else { self->state = START_FIELD; } } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { // normal character - save in field PUSH_CHAR(c); } break; case IN_QUOTED_FIELD: // in quoted field if (IS_ESCAPE_CHAR(c)) { // possible escape character self->state = ESCAPE_IN_QUOTED_FIELD; } else if (IS_QUOTE(c)) { if (self->doublequote) { // double quote - " represented by "" self->state = QUOTE_IN_QUOTED_FIELD; } else { // end of quote part of field self->state = IN_FIELD; } } else { // normal character - save in field PUSH_CHAR(c); } break; case ESCAPE_IN_QUOTED_FIELD: PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; break; case QUOTE_IN_QUOTED_FIELD: // double quote - seen a quote in an quoted field if (IS_QUOTE(c)) { // save "" as " PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; } else if (IS_DELIMITER(c)) { // end of field - end of line not reached yet END_FIELD(); if (self->delim_whitespace) { self->state = EAT_WHITESPACE; } else { self->state = START_FIELD; } } else if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; } else { PUSH_CHAR(c); self->state = IN_FIELD; } break; case EAT_COMMENT: if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; } break; // only occurs with non-custom line terminator, // which is why we directly check for '\n' case EAT_CRNL: if (c == '\n') { END_LINE(); } else if (IS_DELIMITER(c)) { if (self->delim_whitespace) { END_LINE_STATE(EAT_WHITESPACE); } else { // Handle \r-delimited files 
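                        // END_LINE_AND_FIELD_STATE closes the record ended
                        // by the '\r', then closes the empty first field
                        // that this delimiter terminates on the new record.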
END_LINE_AND_FIELD_STATE(START_FIELD); } } else { if (self->delim_whitespace) { /* XXX * first character of a new record--need to back up and * reread * to handle properly... */ i--; buf--; // back up one character (HACK!) END_LINE_STATE(START_RECORD); } else { // \r line terminator // UGH. we don't actually want // to consume the token. fix this later self->stream_len = slen; if (end_line(self) < 0) { goto parsingerror; } stream = self->stream + self->stream_len; slen = self->stream_len; self->state = START_RECORD; --i; buf--; // let's try this character again (HACK!) if (line_limit > 0 && self->lines == start_lines + line_limit) { goto linelimit; } } } break; // only occurs with non-custom line terminator, // which is why we directly check for '\n' case EAT_CRNL_NOP: // inside an ignored comment line self->state = START_RECORD; // \r line terminator -- parse this character again if (c != '\n' && !IS_DELIMITER(c)) { --i; --buf; } break; default: break; } } _TOKEN_CLEANUP(); TRACE(("Finished tokenizing input\n")) return 0; parsingerror: i++; _TOKEN_CLEANUP(); return -1; linelimit: i++; _TOKEN_CLEANUP(); return 0; } static int parser_handle_eof(parser_t *self) { int64_t bufsize = 100; TRACE( ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) if (self->datalen != 0) return -1; switch (self->state) { case START_RECORD: case WHITESPACE_LINE: case EAT_CRNL_NOP: case EAT_LINE_COMMENT: return 0; case ESCAPE_IN_QUOTED_FIELD: case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "EOF inside string starting at row %" PRIu64, self->file_lines); return -1; case ESCAPED_CHAR: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "EOF following escape character"); return -1; case IN_FIELD: case START_FIELD: case QUOTE_IN_QUOTED_FIELD: if (end_field(self) < 0) return -1; break; default: break; } if (end_line(self) < 0) return -1; else return 0; } int parser_consume_rows(parser_t *self, size_t nrows) { int64_t offset, word_deletions; uint64_t char_count, i; if (nrows > self->lines) { nrows = self->lines; } /* do nothing */ if (nrows == 0) return 0; /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; if (word_deletions >= 1) { char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); } else { /* if word_deletions == 0 (i.e. this case) then char_count must * be 0 too, as no data needs to be skipped */ char_count = 0; } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); /* move stream, only if something to move */ if (char_count < self->stream_len) { memmove(self->stream, (self->stream + char_count), self->stream_len - char_count); } /* buffer counts */ self->stream_len -= char_count; /* move token metadata */ // Note: We should always have words_len < word_deletions, so this // subtraction will remain appropriately-typed. for (i = 0; i < self->words_len - word_deletions; ++i) { offset = i + word_deletions; self->words[i] = self->words[offset] - char_count; self->word_starts[i] = self->word_starts[offset] - char_count; } self->words_len -= word_deletions; /* move current word pointer to stream */ self->pword_start -= char_count; self->word_start -= char_count; /* move line metadata */ // Note: We should always have self->lines - nrows + 1 >= 0, so this // subtraction will remain appropriately-typed. 
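    // line_start[] values are offsets into the words array, so surviving
    // entries are shifted down by word_deletions; the per-line field
    // counts are copied through unchanged.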
for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; self->line_fields[i] = self->line_fields[offset]; } self->lines -= nrows; return 0; } static size_t _next_pow2(size_t sz) { size_t result = 1; while (result < sz) result *= 2; return result; } int parser_trim_buffers(parser_t *self) { /* Free memory */ size_t new_cap; void *newptr; uint64_t i; /** * Before we free up space and trim, we should * save how many words we saw when parsing, if * it exceeds the maximum number we saw before. * * This is important for when we read in chunks, * so that we can inform subsequent chunk parsing * as to how many words we could possibly see. */ if (self->words_cap > self->max_words_cap) { self->max_words_cap = self->words_cap; } /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); self->words = realloc(self->words, new_cap * sizeof(char *)); if (self->words == NULL) { return PARSER_OUT_OF_MEMORY; } self->word_starts = realloc(self->word_starts, new_cap * sizeof(int64_t)); if (self->word_starts == NULL) { return PARSER_OUT_OF_MEMORY; } self->words_cap = new_cap; } /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; TRACE( ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "realloc\n")); newptr = realloc(self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { // Update the pointers in the self->words array (char **) if // `realloc` // moved the `self->stream` buffer. This block mirrors a similar // block in // `make_stream_space`. if (self->stream != newptr) { self->pword_start = (char *)newptr + self->word_start; for (i = 0; i < self->words_len; ++i) { self->words[i] = (char *)newptr + self->word_starts[i]; } } self->stream = newptr; self->stream_cap = new_cap; } } /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = newptr; } newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_fields = newptr; self->lines_cap = new_cap; } } return 0; } /* nrows : number of rows to tokenize (or until reach EOF) all : tokenize all the data vs. 
certain number of rows */ int _tokenize_helper(parser_t *self, size_t nrows, int all, const char *encoding_errors) { int status = 0; uint64_t start_lines = self->lines; if (self->state == FINISHED) { return 0; } TRACE(( "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", nrows, self->datapos, self->datalen)); while (1) { if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize, encoding_errors); if (status == REACHED_EOF) { // close out last line status = parser_handle_eof(self); self->state = FINISHED; break; } else if (status != 0) { return status; } } TRACE( ("_tokenize_helper: Trying to process %d bytes, datalen=%d, " "datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); status = tokenize_bytes(self, nrows, start_lines); if (status < 0) { // XXX TRACE( ("_tokenize_helper: Status %d returned from tokenize_bytes, " "breaking\n", status)); status = -1; break; } } TRACE(("leaving tokenize_helper\n")); return status; } int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) { int status = _tokenize_helper(self, nrows, 0, encoding_errors); return status; } int tokenize_all_rows(parser_t *self, const char *encoding_errors) { int status = _tokenize_helper(self, -1, 1, encoding_errors); return status; } /* * Function: to_boolean * -------------------- * * Validate if item should be recognized as a boolean field. * * item: const char* representing parsed text * val : pointer to a uint8_t of boolean representation * * If item is determined to be boolean, this method will set * the appropriate value of val and return 0. A non-zero exit * status means that item was not inferred to be boolean, and * leaves the value of *val unmodified. */ int to_boolean(const char *item, uint8_t *val) { if (strcasecmp(item, "TRUE") == 0) { *val = 1; return 0; } else if (strcasecmp(item, "FALSE") == 0) { *val = 0; return 0; } return -1; } // --------------------------------------------------------------------------- // Implementation of xstrtod // // strtod.c // // Convert string to double // // Copyright (C) 2002 Michael Ringgaard. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. Neither the name of the project nor the names of its contributors // may be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // // ----------------------------------------------------------------------- // Modifications by Warren Weckesser, March 2011: // * Rename strtod() to xstrtod(). // * Added decimal and sci arguments. // * Skip trailing spaces. // * Commented out the other functions. // Modifications by Richard T Guy, August 2013: // * Add tsep argument for thousands separator // // pessimistic but quick assessment, // assuming that each decimal digit requires 4 bits to store const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; double xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { double number; unsigned int i_number = 0; int exponent; int negative; char *p = (char *)str; double p10; int n; int num_digits; int num_decimals; if (maybe_int != NULL) *maybe_int = 1; // Skip leading whitespace. while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; switch (*p) { case '-': negative = 1; // Fall through to increment position. case '+': p++; } exponent = 0; num_digits = 0; num_decimals = 0; // Process string of digits. while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { i_number = i_number * 10 + (*p - '0'); p++; num_digits++; p += (tsep != '\0' && *p == tsep); } number = i_number; if (num_digits > max_int_decimal_digits) { // process what's left as double while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; p += (tsep != '\0' && *p == tsep); } } // Process decimal part. if (*p == decimal) { if (maybe_int != NULL) *maybe_int = 0; p++; while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; num_decimals++; } exponent -= num_decimals; } if (num_digits == 0) { *error = ERANGE; return 0.0; } // Correct for sign. if (negative) number = -number; // Process an exponent string. if (toupper_ascii(*p) == toupper_ascii(sci)) { if (maybe_int != NULL) *maybe_int = 0; // Handle optional sign. negative = 0; switch (*++p) { case '-': negative = 1; // Fall through to increment pos. case '+': p++; } // Process string of digits. num_digits = 0; n = 0; while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; } if (negative) exponent -= n; else exponent += n; // If no digits, after the 'e'/'E', un-consume it if (num_digits == 0) p--; } if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { *error = ERANGE; return HUGE_VAL; } // Scale the result. p10 = 10.; n = exponent; if (n < 0) n = -n; while (n) { if (n & 1) { if (exponent < 0) number /= p10; else number *= p10; } n >>= 1; p10 *= p10; } if (number == HUGE_VAL) { *error = ERANGE; } if (skip_trailing) { // Skip trailing whitespace. 
while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; return number; } double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { double number; int exponent; int negative; char *p = (char *)str; int num_digits; int num_decimals; int max_digits = 17; int n; if (maybe_int != NULL) *maybe_int = 1; // Cache powers of 10 in memory. static double e[] = { 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; // Skip leading whitespace. while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; switch (*p) { case '-': negative = 1; // Fall through to increment position. case '+': p++; } number = 0.; exponent = 0; num_digits = 0; num_decimals = 0; // Process string of digits. while (isdigit_ascii(*p)) { if (num_digits < max_digits) { number = number * 10. + (*p - '0'); num_digits++; } else { ++exponent; } p++; p += (tsep != '\0' && *p == tsep); } // Process decimal part if (*p == decimal) { if (maybe_int != NULL) *maybe_int = 0; p++; while (num_digits < max_digits && isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; num_decimals++; } if (num_digits >= max_digits) // Consume extra decimal digits. while (isdigit_ascii(*p)) ++p; exponent -= num_decimals; } if (num_digits == 0) { *error = ERANGE; return 0.0; } // Correct for sign. if (negative) number = -number; // Process an exponent string. 
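    // Worked example (illustrative): for "1.5e3" with decimal == '.' and
    // sci == 'e', the digit loops above leave number == 15 and
    // num_decimals == 1, so exponent == -1; the exponent parse below adds
    // 3, giving exponent == 2, and the scaling step returns 15 * 10^2 == 1500.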
if (toupper_ascii(*p) == toupper_ascii(sci)) { if (maybe_int != NULL) *maybe_int = 0; // Handle optional sign negative = 0; switch (*++p) { case '-': negative = 1; // Fall through to increment pos. case '+': p++; } // Process string of digits. num_digits = 0; n = 0; while (num_digits < max_digits && isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; } if (negative) exponent -= n; else exponent += n; // If no digits after the 'e'/'E', un-consume it. if (num_digits == 0) p--; } if (exponent > 308) { *error = ERANGE; return HUGE_VAL; } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal if (exponent < -616) { // Prevent invalid array access. number = 0.; } else { number /= e[-308 - exponent]; number /= e[308]; } } else { number /= e[-exponent]; } if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE; if (skip_trailing) { // Skip trailing whitespace. while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; return number; } /* copy a decimal number string with `decimal`, `tsep` as decimal point and thousands separator to an equivalent c-locale decimal string (striping `tsep`, replacing `decimal` with '.'). The returned memory should be free-d with a call to `free`. */ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { const char *p = s; size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; // Skip leading whitespace. while (isspace_ascii(*p)) p++; // Copy Leading sign if (*p == '+' || *p == '-') { *dst++ = *p++; } // Copy integer part dropping `tsep` while (isdigit_ascii(*p)) { *dst++ = *p++; p += (tsep != '\0' && *p == tsep); } // Replace `decimal` with '.' if (*p == decimal) { *dst++ = '.'; p++; } // Copy fractional part after decimal (if any) while (isdigit_ascii(*p)) { *dst++ = *p++; } // Copy exponent if any if (toupper_ascii(*p) == toupper_ascii('E')) { *dst++ = *p++; // Copy leading exponent sign (if any) if (*p == '+' || *p == '-') { *dst++ = *p++; } // Copy exponent digits while (isdigit_ascii(*p)) { *dst++ = *p++; } } *dst++ = '\0'; // terminate if (endpos != NULL) *endpos = (char *)p; return s_copy; } double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove t(housand)sep. char *endptr; char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate; gstate = PyGILState_Ensure(); char *endpc; double r = PyOS_string_to_double(pc, &endpc, 0); // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { if (q != NULL) { // report endptr from source string (p) *q = endptr; } } else { *error = -1; if (q != NULL) { // p and pc are different len due to tsep removal. Can't report // how much it has consumed of p. Just rewind to beginning. 
*q = (char *)p; // TODO(willayd): this could be undefined behavior } } if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); free(pc); if (skip_trailing && q != NULL && *q != p) { while (isspace_ascii(**q)) { (*q)++; } } return r; } // End of xstrtod code // --------------------------------------------------------------------------- void uint_state_init(uint_state *self) { self->seen_sint = 0; self->seen_uint = 0; self->seen_null = 0; } int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = p_item; int isneg = 0; int64_t number = 0; int d; // Skip leading spaces. while (isspace_ascii(*p)) { ++p; } // Handle sign. if (*p == '-') { isneg = 1; ++p; } else if (*p == '+') { p++; } // Check that there is a first digit. if (!isdigit_ascii(*p)) { // Error... *error = ERROR_NO_DIGITS; return 0; } if (isneg) { // If number is greater than pre_min, at least one more digit // can be processed without overflowing. int dig_pre_min = -(int_min % 10); int64_t pre_min = int_min / 10; // Process the digits. d = *p; if (tsep != '\0') { while (1) { if (d == tsep) { d = *++p; continue; } else if (!isdigit_ascii(d)) { break; } if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { number = number * 10 - (d - '0'); d = *++p; } else { *error = ERROR_OVERFLOW; return 0; } } } else { while (isdigit_ascii(d)) { if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { number = number * 10 - (d - '0'); d = *++p; } else { *error = ERROR_OVERFLOW; return 0; } } } } else { // If number is less than pre_max, at least one more digit // can be processed without overflowing. int64_t pre_max = int_max / 10; int dig_pre_max = int_max % 10; // Process the digits. d = *p; if (tsep != '\0') { while (1) { if (d == tsep) { d = *++p; continue; } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; } else { *error = ERROR_OVERFLOW; return 0; } } } else { while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; } else { *error = ERROR_OVERFLOW; return 0; } } } } // Skip trailing spaces. while (isspace_ascii(*p)) { ++p; } // Did we use up all the characters? if (*p) { *error = ERROR_INVALID_CHARS; return 0; } *error = 0; return number; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { const char *p = p_item; uint64_t pre_max = uint_max / 10; int dig_pre_max = uint_max % 10; uint64_t number = 0; int d; // Skip leading spaces. while (isspace_ascii(*p)) { ++p; } // Handle sign. if (*p == '-') { state->seen_sint = 1; *error = 0; return 0; } else if (*p == '+') { p++; } // Check that there is a first digit. if (!isdigit_ascii(*p)) { // Error... *error = ERROR_NO_DIGITS; return 0; } // If number is less than pre_max, at least one more digit // can be processed without overflowing. // // Process the digits. 
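    // Example: with uint_max == UINT64_MAX == 18446744073709551615,
    // pre_max == 1844674407370955161 and dig_pre_max == 5, so another
    // digit d is only accepted while number < pre_max, or number == pre_max
    // and d - '0' <= 5.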
d = *p; if (tsep != '\0') { while (1) { if (d == tsep) { d = *++p; continue; } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; } else { *error = ERROR_OVERFLOW; return 0; } } } else { while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; } else { *error = ERROR_OVERFLOW; return 0; } } } // Skip trailing spaces. while (isspace_ascii(*p)) { ++p; } // Did we use up all the characters? if (*p) { *error = ERROR_INVALID_CHARS; return 0; } if (number > (uint64_t)int_max) { state->seen_uint = 1; } *error = 0; return number; }
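
/*
 * Usage sketch (illustrative only; the real driver is the Cython layer in
 * parsers.pyx). The reader callbacks below are hypothetical stand-ins whose
 * exact typedefs live in tokenizer.h; their shape is inferred from the
 * cb_io call in parser_buffer_bytes. Only the parser_* / tokenize_* calls
 * are functions defined in this file.
 *
 *     parser_t *parser = parser_new();
 *     parser_set_default_options(parser);
 *
 *     parser->source = my_source;        // opaque handle passed to cb_io
 *     parser->cb_io = my_read_bytes;     // hypothetical: fills a buffer,
 *                                        // sets bytes_read and status
 *     parser->cb_cleanup = my_cleanup;   // hypothetical: releases my_source
 *     parser->chunksize = 256 * 1024;    // bytes requested per cb_io call
 *
 *     if (parser_init(parser) == 0) {
 *         // Tokenize up to 1000 rows (or tokenize_all_rows for everything);
 *         // a negative status leaves a message in parser->error_msg.
 *         int status = tokenize_nrows(parser, 1000, "strict");
 *         if (status < 0 && parser->error_msg != NULL) {
 *             fprintf(stderr, "%s", parser->error_msg);
 *         }
 *     }
 *
 *     parser_free(parser);  // releases the internal buffers
 *     parser_del(parser);   // releases the parser_t itself
 */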