| /*------------------------------------------------------------------------- |
| * |
| * copyfromparse.c |
| * Parse CSV/text/binary format for COPY FROM. |
| * |
| * This file contains routines to parse the text, CSV and binary input |
| * formats. The main entry point is NextCopyFrom(), which parses the |
| * next input line and returns it as Datums. |
| * |
| * In text/CSV mode, the parsing happens in multiple stages: |
| * |
| * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf |
| * 1. 2. 3. 4. |
| * |
| * 1. CopyLoadRawBuf() reads raw data from the input file or client, and |
| * places it into 'raw_buf'. |
| * |
| * 2. CopyConvertBuf() calls the encoding conversion function to convert |
| * the data in 'raw_buf' from client to server encoding, placing the |
| * converted result in 'input_buf'. |
| * |
| * 3. CopyReadLine() parses the data in 'input_buf', one line at a time. |
| * It is responsible for finding the next newline marker, taking quote and |
| * escape characters into account according to the COPY options. The line |
| * is copied into 'line_buf', with quotes and escape characters still |
| * intact. |
| * |
| * 4. CopyReadAttributesText/CSV() function takes the input line from |
| * 'line_buf', and splits it into fields, unescaping the data as required. |
| * The fields are stored in 'attribute_buf', and 'raw_fields' array holds |
| * pointers to each field. |
| * |
| * If encoding conversion is not required, a shortcut is taken in step 2 to |
| * avoid copying the data unnecessarily. The 'input_buf' pointer is set to |
| * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data |
| * directly into 'input_buf'. CopyConvertBuf() then merely validates that |
| * the data is valid in the current encoding. |
| * |
| * In binary mode, the pipeline is much simpler. Input is loaded into |
| * into 'raw_buf', and encoding conversion is done in the datatype-specific |
| * receive functions, if required. 'input_buf' and 'line_buf' are not used, |
| * but 'attribute_buf' is used as a temporary buffer to hold one attribute's |
| * data when it's passed the receive function. |
| * |
| * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also |
| * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf' |
| * and 'attribute_buf' are expanded on demand, to hold the longest line |
| * encountered so far. |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/commands/copyfromparse.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <ctype.h> |
| #include <unistd.h> |
| #include <sys/stat.h> |
| |
| #include "commands/copy.h" |
| #include "commands/copyfrom_internal.h" |
| #include "commands/progress.h" |
| #include "executor/executor.h" |
| #include "libpq/libpq.h" |
| #include "libpq/pqformat.h" |
| #include "mb/pg_wchar.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "port/pg_bswap.h" |
| #include "utils/elog.h" |
| #include "utils/memutils.h" |
| #include "utils/rel.h" |
| |
| #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7')) |
| #define OCTVALUE(c) ((c) - '0') |
| |
| /* |
| * These macros centralize code used to process line_buf and input_buf buffers. |
| * They are macros because they often do continue/break control and to avoid |
| * function call overhead in tight COPY loops. |
| * |
| * We must use "if (1)" because the usual "do {...} while(0)" wrapper would |
| * prevent the continue/break processing from working. We end the "if (1)" |
| * with "else ((void) 0)" to ensure the "if" does not unintentionally match |
| * any "else" in the calling code, and to avoid any compiler warnings about |
| * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros. |
| */ |
| |
| /* |
| * This keeps the character read at the top of the loop in the buffer |
| * even if there is more than one read-ahead. |
| */ |
| #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \ |
| if (1) \ |
| { \ |
| if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \ |
| { \ |
| input_buf_ptr = prev_raw_ptr; /* undo fetch */ \ |
| need_data = true; \ |
| continue; \ |
| } \ |
| } else ((void) 0) |
| |
| /* This consumes the remainder of the buffer and breaks */ |
| #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \ |
| if (1) \ |
| { \ |
| if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \ |
| { \ |
| if (extralen) \ |
| input_buf_ptr = copy_buf_len; /* consume the partial character */ \ |
| /* backslash just before EOF, treat as data char */ \ |
| result = true; \ |
| break; \ |
| } \ |
| } else ((void) 0) |
| |
| /* |
| * Transfer any approved data to line_buf; must do this to be sure |
| * there is some room in input_buf. |
| */ |
| #define REFILL_LINEBUF \ |
| if (1) \ |
| { \ |
| if (input_buf_ptr > cstate->input_buf_index) \ |
| { \ |
| appendBinaryStringInfo(&cstate->line_buf, \ |
| cstate->input_buf + cstate->input_buf_index, \ |
| input_buf_ptr - cstate->input_buf_index); \ |
| cstate->input_buf_index = input_buf_ptr; \ |
| } \ |
| } else ((void) 0) |
| |
| /* Undo any read-ahead and jump out of the block. */ |
| #define NO_END_OF_COPY_GOTO \ |
| if (1) \ |
| { \ |
| input_buf_ptr = prev_raw_ptr + 1; \ |
| goto not_end_of_copy; \ |
| } else ((void) 0) |
| |
| /* NOTE: there's a copy of this in copyto.c */ |
| static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; |
| |
| /* Low-level communications functions */ |
| static void CopyLoadInputBuf(CopyFromState cstate); |
| static void RemoveInvalidDataInBuf(CopyFromState cstate); |
| static bool FindEolInUnverifyRawBuf(const CopyFromState cstate, int *scanidx); |
| |
| void |
| ReceiveCopyBegin(CopyFromState cstate) |
| { |
| StringInfoData buf; |
| int natts = list_length(cstate->attnumlist); |
| int16 format = (cstate->opts.binary ? 1 : 0); |
| int i; |
| |
| pq_beginmessage(&buf, 'G'); |
| pq_sendbyte(&buf, format); /* overall format */ |
| pq_sendint16(&buf, natts); |
| for (i = 0; i < natts; i++) |
| pq_sendint16(&buf, format); /* per-column formats */ |
| pq_endmessage(&buf); |
| cstate->copy_src = COPY_FRONTEND; |
| cstate->fe_msgbuf = makeStringInfo(); |
| /* We *must* flush here to ensure FE knows it can send. */ |
| pq_flush(); |
| } |
| |
| void |
| ReceiveCopyBinaryHeader(CopyFromState cstate) |
| { |
| char readSig[11]; |
| int32 tmp; |
| |
| /* Signature */ |
| if (CopyReadBinaryData(cstate, readSig, 11) != 11 || |
| memcmp(readSig, BinarySignature, 11) != 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("COPY file signature not recognized"))); |
| /* Flags field */ |
| if (!CopyGetInt32(cstate, &tmp)) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("invalid COPY file header (missing flags)"))); |
| if ((tmp & (1 << 16)) != 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("invalid COPY file header (WITH OIDS)"))); |
| tmp &= ~(1 << 16); |
| if ((tmp >> 16) != 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("unrecognized critical flags in COPY file header"))); |
| /* Header extension length */ |
| if (!CopyGetInt32(cstate, &tmp) || |
| tmp < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("invalid COPY file header (missing length)"))); |
| /* Skip extension header, if present */ |
| while (tmp-- > 0) |
| { |
| if (CopyReadBinaryData(cstate, readSig, 1) != 1) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("invalid COPY file header (wrong length)"))); |
| } |
| } |
| |
| /* |
| * CopyGetData reads data from the source (file or frontend) |
| * |
| * We attempt to read at least minread, and at most maxread, bytes from |
| * the source. The actual number of bytes read is returned; if this is |
| * less than minread, EOF was detected. |
| * |
| * Note: when copying from the frontend, we expect a proper EOF mark per |
| * protocol; if the frontend simply drops the connection, we raise error. |
| * It seems unwise to allow the COPY IN to complete normally in that case. |
| * |
| * NB: no data conversion is applied here. |
| */ |
| int |
| CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) |
| { |
| int bytesread = 0; |
| |
| switch (cstate->copy_src) |
| { |
| case COPY_FILE: |
| bytesread = fread(databuf, 1, maxread, cstate->copy_file); |
| if (ferror(cstate->copy_file)) |
| { |
| if (cstate->is_program) |
| { |
| int olderrno = errno; |
| if (cstate->copy_file) |
| { |
| fclose(cstate->copy_file); |
| cstate->copy_file = NULL; |
| } |
| close_program_pipes(cstate->program_pipes, true); |
| |
| /* |
| * If close_program_pipes() didn't throw an error, |
| * the program terminated normally, but closed the |
| * pipe first. Restore errno, and throw an error. |
| */ |
| errno = olderrno; |
| |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not read from COPY program: %m"))); |
| } |
| else |
| ereport(ERROR, |
| (errcode_for_file_access(), |
| errmsg("could not read from COPY file: %m"))); |
| } |
| if (bytesread == 0) |
| cstate->raw_reached_eof = true; |
| break; |
| case COPY_FRONTEND: |
| while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof) |
| { |
| int avail; |
| |
| while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len) |
| { |
| /* Try to receive another message */ |
| int mtype; |
| int maxmsglen; |
| |
| readmessage: |
| HOLD_CANCEL_INTERRUPTS(); |
| pq_startmsgread(); |
| mtype = pq_getbyte(); |
| if (mtype == EOF) |
| ereport(ERROR, |
| (errcode(ERRCODE_CONNECTION_FAILURE), |
| errmsg("unexpected EOF on client connection with an open transaction"))); |
| /* Validate message type and set packet size limit */ |
| switch (mtype) |
| { |
| case 'd': /* CopyData */ |
| maxmsglen = PQ_LARGE_MESSAGE_LIMIT; |
| break; |
| case 'c': /* CopyDone */ |
| case 'f': /* CopyFail */ |
| case 'H': /* Flush */ |
| case 'S': /* Sync */ |
| maxmsglen = PQ_SMALL_MESSAGE_LIMIT; |
| break; |
| default: |
| ereport(ERROR, |
| (errcode(ERRCODE_PROTOCOL_VIOLATION), |
| errmsg("unexpected message type 0x%02X during COPY from stdin", |
| mtype))); |
| maxmsglen = 0; /* keep compiler quiet */ |
| break; |
| } |
| /* Now collect the message body */ |
| if (pq_getmessage(cstate->fe_msgbuf, maxmsglen)) |
| ereport(ERROR, |
| (errcode(ERRCODE_CONNECTION_FAILURE), |
| errmsg("unexpected EOF on client connection with an open transaction"))); |
| RESUME_CANCEL_INTERRUPTS(); |
| /* ... and process it */ |
| switch (mtype) |
| { |
| case 'd': /* CopyData */ |
| break; |
| case 'c': /* CopyDone */ |
| /* COPY IN correctly terminated by frontend */ |
| cstate->raw_reached_eof = true; |
| return bytesread; |
| case 'f': /* CopyFail */ |
| ereport(ERROR, |
| (errcode(ERRCODE_QUERY_CANCELED), |
| errmsg("COPY from stdin failed: %s", |
| pq_getmsgstring(cstate->fe_msgbuf)))); |
| break; |
| case 'H': /* Flush */ |
| case 'S': /* Sync */ |
| |
| /* |
| * Ignore Flush/Sync for the convenience of client |
| * libraries (such as libpq) that may send those |
| * without noticing that the command they just |
| * sent was COPY. |
| */ |
| goto readmessage; |
| default: |
| Assert(false); /* NOT REACHED */ |
| } |
| } |
| avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor; |
| if (avail > maxread) |
| avail = maxread; |
| pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail); |
| databuf = (void *) ((char *) databuf + avail); |
| maxread -= avail; |
| bytesread += avail; |
| } |
| break; |
| case COPY_CALLBACK: |
| bytesread = cstate->data_source_cb(databuf, minread, maxread, |
| cstate->data_source_cb_extra); |
| break; |
| } |
| |
| return bytesread; |
| } |
| |
| /* |
| * Perform encoding conversion on data in 'raw_buf', writing the converted |
| * data into 'input_buf'. |
| * |
| * On entry, there must be some data to convert in 'raw_buf'. |
| */ |
| static void |
| CopyConvertBuf(CopyFromState cstate) |
| { |
| /* |
| * If the file and server encoding are the same, no encoding conversion is |
| * required. However, we still need to verify that the input is valid for |
| * the encoding. |
| */ |
| if (!cstate->need_transcoding) |
| { |
| /* |
| * When conversion is not required, input_buf and raw_buf are the |
| * same. raw_buf_len is the total number of bytes in the buffer, and |
| * input_buf_len tracks how many of those bytes have already been |
| * verified. |
| */ |
| int preverifiedlen = cstate->input_buf_len; |
| int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len; |
| int nverified; |
| |
| if (unverifiedlen == 0) |
| { |
| /* |
| * If no more raw data is coming, report the EOF to the caller. |
| */ |
| if (cstate->raw_reached_eof) |
| cstate->input_reached_eof = true; |
| return; |
| } |
| |
| /* |
| * Verify the new data, including any residual unverified bytes from |
| * previous round. |
| */ |
| nverified = pg_encoding_verifymbstr(cstate->file_encoding, |
| cstate->raw_buf + preverifiedlen, |
| unverifiedlen); |
| if (nverified == 0) |
| { |
| /* |
| * Could not verify anything. |
| * |
| * If there is no more raw input data coming, it means that there |
| * was an incomplete multi-byte sequence at the end. Also, if |
| * there's "enough" input left, we should be able to verify at |
| * least one character, and a failure to do so means that we've |
| * hit an invalid byte sequence. |
| */ |
| if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding)) |
| cstate->input_reached_error = true; |
| return; |
| } |
| cstate->input_buf_len += nverified; |
| } |
| else |
| { |
| /* |
| * Encoding conversion is needed. |
| */ |
| int nbytes; |
| unsigned char *src; |
| int srclen; |
| unsigned char *dst; |
| int dstlen; |
| int convertedlen; |
| |
| if (RAW_BUF_BYTES(cstate) == 0) |
| { |
| /* |
| * If no more raw data is coming, report the EOF to the caller. |
| */ |
| if (cstate->raw_reached_eof) |
| cstate->input_reached_eof = true; |
| return; |
| } |
| |
| /* |
| * First, copy down any unprocessed data. |
| */ |
| nbytes = INPUT_BUF_BYTES(cstate); |
| if (nbytes > 0 && cstate->input_buf_index > 0) |
| memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index, |
| nbytes); |
| cstate->input_buf_index = 0; |
| cstate->input_buf_len = nbytes; |
| cstate->input_buf[nbytes] = '\0'; |
| |
| src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index; |
| srclen = cstate->raw_buf_len - cstate->raw_buf_index; |
| dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len; |
| dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1; |
| |
| /* |
| * Do the conversion. This might stop short, if there is an invalid |
| * byte sequence in the input. We'll convert as much as we can in |
| * that case. |
| * |
| * Note: Even if we hit an invalid byte sequence, we don't report the |
| * error until all the valid bytes have been consumed. The input |
| * might contain an end-of-input marker (\.), and we don't want to |
| * report an error if the invalid byte sequence is after the |
| * end-of-input marker. We might unnecessarily convert some data |
| * after the end-of-input marker as long as it's valid for the |
| * encoding, but that's harmless. |
| */ |
| convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc, |
| cstate->file_encoding, |
| GetDatabaseEncoding(), |
| src, srclen, |
| dst, dstlen, |
| true); |
| if (convertedlen == 0) |
| { |
| /* |
| * Could not convert anything. If there is no more raw input data |
| * coming, it means that there was an incomplete multi-byte |
| * sequence at the end. Also, if there is plenty of input left, |
| * we should be able to convert at least one character, so a |
| * failure to do so must mean that we've hit a byte sequence |
| * that's invalid. |
| */ |
| if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH) |
| cstate->input_reached_error = true; |
| return; |
| } |
| cstate->raw_buf_index += convertedlen; |
| cstate->input_buf_len += strlen((char *) dst); |
| } |
| } |
| |
| /* |
| * Report an encoding or conversion error. |
| */ |
| static void |
| CopyConversionError(CopyFromState cstate) |
| { |
| Assert(cstate->raw_buf_len > 0); |
| Assert(cstate->input_reached_error); |
| |
| if (!cstate->need_transcoding) |
| { |
| /* |
| * Everything up to input_buf_len was successfully verified, and |
| * input_buf_len points to the invalid or incomplete character. |
| */ |
| report_invalid_encoding(cstate->file_encoding, |
| cstate->raw_buf + cstate->input_buf_len, |
| cstate->raw_buf_len - cstate->input_buf_len); |
| } |
| else |
| { |
| /* |
| * raw_buf_index points to the invalid or untranslatable character. We |
| * let the conversion routine report the error, because it can provide |
| * a more specific error message than we could here. An earlier call |
| * to the conversion routine in CopyConvertBuf() detected that there |
| * is an error, now we call the conversion routine again with |
| * noError=false, to have it throw the error. |
| */ |
| unsigned char *src; |
| int srclen; |
| unsigned char *dst; |
| int dstlen; |
| |
| src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index; |
| srclen = cstate->raw_buf_len - cstate->raw_buf_index; |
| dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len; |
| dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1; |
| |
| (void) pg_do_encoding_conversion_buf(cstate->conversion_proc, |
| cstate->file_encoding, |
| GetDatabaseEncoding(), |
| src, srclen, |
| dst, dstlen, |
| false); |
| |
| /* |
| * The conversion routine should have reported an error, so this |
| * should not be reached. |
| */ |
| elog(ERROR, "encoding conversion failed without error"); |
| } |
| } |
| |
| /* |
| * Load more data from data source to raw_buf. |
| * |
| * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the |
| * beginning of the buffer, and we load new data after that. |
| */ |
| static void |
| CopyLoadRawBuf(CopyFromState cstate) |
| { |
| int nbytes; |
| int inbytes; |
| |
| /* |
| * In text mode, if encoding conversion is not required, raw_buf and |
| * input_buf point to the same buffer. Their len/index better agree, too. |
| */ |
| if (cstate->raw_buf == cstate->input_buf) |
| { |
| Assert(!cstate->need_transcoding); |
| Assert(cstate->raw_buf_index == cstate->input_buf_index); |
| Assert(cstate->input_buf_len <= cstate->raw_buf_len); |
| } |
| |
| /* |
| * Copy down the unprocessed data if any. |
| */ |
| nbytes = RAW_BUF_BYTES(cstate); |
| if (nbytes > 0 && cstate->raw_buf_index > 0) |
| memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index, |
| nbytes); |
| cstate->raw_buf_len -= cstate->raw_buf_index; |
| cstate->raw_buf_index = 0; |
| |
| /* |
| * If raw_buf and input_buf are in fact the same buffer, adjust the |
| * input_buf variables, too. |
| */ |
| if (cstate->raw_buf == cstate->input_buf) |
| { |
| cstate->input_buf_len -= cstate->input_buf_index; |
| cstate->input_buf_index = 0; |
| } |
| |
| /* Load more data */ |
| inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len, |
| 1, RAW_BUF_SIZE - cstate->raw_buf_len); |
| nbytes += inbytes; |
| cstate->raw_buf[nbytes] = '\0'; |
| cstate->raw_buf_len = nbytes; |
| |
| cstate->bytes_processed += inbytes; |
| pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed); |
| |
| if (inbytes == 0) |
| cstate->raw_reached_eof = true; |
| } |
| |
| /* |
| * CopyLoadInputBuf loads some more data into input_buf |
| * |
| * On return, at least one more input character is loaded into |
| * input_buf, or input_reached_eof is set. |
| * |
| * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start |
| * of the buffer and then we load more data after that. |
| */ |
| static void |
| CopyLoadInputBuf(CopyFromState cstate) |
| { |
| int nbytes = INPUT_BUF_BYTES(cstate); |
| |
| /* |
| * The caller has updated input_buf_index to indicate how much of the |
| * input has been consumed and isn't needed anymore. If input_buf is the |
| * same physical area as raw_buf, update raw_buf_index accordingly. |
| */ |
| if (cstate->raw_buf == cstate->input_buf) |
| { |
| Assert(!cstate->need_transcoding); |
| Assert(cstate->input_buf_index >= cstate->raw_buf_index); |
| cstate->raw_buf_index = cstate->input_buf_index; |
| } |
| |
| for (;;) |
| { |
| /* If we now have some unconverted data, try to convert it */ |
| sreh_conversion_error: |
| CopyConvertBuf(cstate); |
| |
| /* If we now have some more input bytes ready, return them */ |
| if (INPUT_BUF_BYTES(cstate) > nbytes) |
| return; |
| |
| /* |
| * If we reached an invalid byte sequence, or we're at an incomplete |
| * multi-byte character but there is no more raw input data, report |
| * conversion error. |
| */ |
| if (cstate->input_reached_error) |
| { |
| if (cstate->cdbsreh) |
| { |
| MemoryContext oldcontext = CurrentMemoryContext; |
| PG_TRY(); |
| { |
| CopyConversionError(cstate); |
| } |
| PG_CATCH(); |
| { |
| HandleCopyError(cstate); |
| RemoveInvalidDataInBuf(cstate); |
| MemoryContextSwitchTo(oldcontext); |
| } |
| PG_END_TRY(); |
| /* clean illegal character do conversion again */ |
| goto sreh_conversion_error; |
| } |
| else |
| CopyConversionError(cstate); |
| } |
| |
| /* no more input, and everything has been converted */ |
| if (cstate->input_reached_eof) |
| break; |
| |
| /* Try to load more raw data */ |
| Assert(!cstate->raw_reached_eof); |
| CopyLoadRawBuf(cstate); |
| |
| while (cstate->find_eol_with_rawreading && !cstate->raw_reached_eof) |
| { |
| int seekid; |
| if (FindEolInUnverifyRawBuf(cstate, &seekid)) |
| { |
| cstate->raw_buf_index += seekid; |
| nbytes = RAW_BUF_BYTES(cstate); |
| if (nbytes > 0 && cstate->raw_buf_index > 0) |
| memmove(cstate->raw_buf, |
| cstate->raw_buf + cstate->raw_buf_index, nbytes); |
| |
| cstate->raw_buf_len -= cstate->raw_buf_index; |
| cstate->raw_buf_index = 0; |
| cstate->find_eol_with_rawreading = false; |
| } |
| else |
| { |
| /* |
| * if EOL can not find, continuous read raw page until eof or |
| * find out. |
| */ |
| CopyLoadRawBuf(cstate); |
| } |
| } |
| } |
| } |
| |
| /* |
| * CopyReadBinaryData |
| * |
| * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf |
| * and writes them to 'dest'. Returns the number of bytes read (which |
| * would be less than 'nbytes' only if we reach EOF). |
| */ |
| int |
| CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes) |
| { |
| int copied_bytes = 0; |
| |
| if (RAW_BUF_BYTES(cstate) >= nbytes) |
| { |
| /* Enough bytes are present in the buffer. */ |
| memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes); |
| cstate->raw_buf_index += nbytes; |
| copied_bytes = nbytes; |
| } |
| else |
| { |
| /* |
| * Not enough bytes in the buffer, so must read from the file. Need |
| * to loop since 'nbytes' could be larger than the buffer size. |
| */ |
| do |
| { |
| int copy_bytes; |
| |
| /* Load more data if buffer is empty. */ |
| if (RAW_BUF_BYTES(cstate) == 0) |
| { |
| CopyLoadRawBuf(cstate); |
| if (cstate->raw_reached_eof) |
| break; /* EOF */ |
| } |
| |
| /* Transfer some bytes. */ |
| copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate)); |
| memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes); |
| cstate->raw_buf_index += copy_bytes; |
| dest += copy_bytes; |
| copied_bytes += copy_bytes; |
| } while (copied_bytes < nbytes); |
| } |
| |
| return copied_bytes; |
| } |
| |
| /* |
| * Read the next input line and stash it in line_buf. |
| * |
| * Result is true if read was terminated by EOF, false if terminated |
| * by newline. The terminating newline or EOF marker is not included |
| * in the final value of line_buf. |
| */ |
| bool |
| CopyReadLine(CopyFromState cstate) |
| { |
| bool result; |
| |
| resetStringInfo(&cstate->line_buf); |
| cstate->line_buf_valid = false; |
| |
| /* Parse data and transfer into line_buf */ |
| result = CopyReadLineText(cstate); |
| |
| if (result) |
| { |
| /* |
| * Reached EOF. In protocol version 3, we should ignore anything |
| * after \. up to the protocol end of copy data. (XXX maybe better |
| * not to treat \. as special?) |
| */ |
| if (cstate->copy_src == COPY_FRONTEND) |
| { |
| int inbytes; |
| |
| do |
| { |
| inbytes = CopyGetData(cstate, cstate->input_buf, |
| 1, INPUT_BUF_SIZE); |
| } while (inbytes > 0); |
| cstate->input_buf_index = 0; |
| cstate->input_buf_len = 0; |
| cstate->raw_buf_index = 0; |
| cstate->raw_buf_len = 0; |
| } |
| } |
| else |
| { |
| /* |
| * If we didn't hit EOF, then we must have transferred the EOL marker |
| * to line_buf along with the data. Get rid of it. |
| */ |
| switch (cstate->eol_type) |
| { |
| case EOL_NL: |
| Assert(cstate->line_buf.len >= 1); |
| Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n'); |
| cstate->line_buf.len--; |
| cstate->line_buf.data[cstate->line_buf.len] = '\0'; |
| break; |
| case EOL_CR: |
| Assert(cstate->line_buf.len >= 1); |
| Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r'); |
| cstate->line_buf.len--; |
| cstate->line_buf.data[cstate->line_buf.len] = '\0'; |
| break; |
| case EOL_CRNL: |
| Assert(cstate->line_buf.len >= 2); |
| Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r'); |
| Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n'); |
| cstate->line_buf.len -= 2; |
| cstate->line_buf.data[cstate->line_buf.len] = '\0'; |
| break; |
| case EOL_UNKNOWN: |
| /* shouldn't get here */ |
| Assert(false); |
| break; |
| } |
| } |
| |
| /* Now it's safe to use the buffer in error messages */ |
| cstate->line_buf_valid = true; |
| |
| return result; |
| } |
| |
| /* |
| * CopyReadLineText - inner loop of CopyReadLine for text mode |
| */ |
| bool |
| CopyReadLineText(CopyFromState cstate) |
| { |
| char *copy_input_buf; |
| int input_buf_ptr; |
| int copy_buf_len; |
| bool need_data = false; |
| bool hit_eof = false; |
| bool result = false; |
| |
| /* CSV variables */ |
| bool first_char_in_line = true; |
| bool in_quote = false, |
| last_was_esc = false; |
| char quotec = '\0'; |
| char escapec = '\0'; |
| |
| if (cstate->opts.csv_mode) |
| { |
| quotec = cstate->opts.quote[0]; |
| escapec = cstate->opts.escape[0]; |
| /* ignore special escape processing if it's the same as quotec */ |
| if (quotec == escapec) |
| escapec = '\0'; |
| } |
| |
| /* |
| * The objective of this loop is to transfer the entire next input line |
| * into line_buf. Hence, we only care for detecting newlines (\r and/or |
| * \n) and the end-of-copy marker (\.). |
| * |
| * In CSV mode, \r and \n inside a quoted field are just part of the data |
| * value and are put in line_buf. We keep just enough state to know if we |
| * are currently in a quoted field or not. |
| * |
| * These four characters, and the CSV escape and quote characters, are |
| * assumed the same in frontend and backend encodings. |
| * |
| * For speed, we try to move data from raw_buf to line_buf in chunks |
| * rather than one character at a time. input_buf_ptr points to the next |
| * character to examine; any characters from raw_buf_index to input_buf_ptr |
| * have been determined to be part of the line, but not yet transferred to |
| * line_buf. |
| * |
| * For a little extra speed within the loop, we copy raw_buf and |
| * raw_buf_len into local variables. |
| */ |
| copy_input_buf = cstate->input_buf; |
| input_buf_ptr = cstate->input_buf_index; |
| copy_buf_len = cstate->input_buf_len; |
| |
| for (;;) |
| { |
| int prev_raw_ptr; |
| char c; |
| |
| /* |
| * Load more data if needed. Ideally we would just force four bytes |
| * of read-ahead and avoid the many calls to |
| * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol |
| * does not allow us to read too far ahead or we might read into the |
| * next data, so we read-ahead only as far we know we can. One |
| * optimization would be to read-ahead four byte here if |
| * cstate->copy_src != COPY_FRONTEND, but it hardly seems worth it, |
| * considering the size of the buffer. |
| */ |
| if (input_buf_ptr >= copy_buf_len || need_data) |
| { |
| REFILL_LINEBUF; |
| |
| CopyLoadInputBuf(cstate); |
| |
| /* update our local variables */ |
| hit_eof = cstate->input_reached_eof; |
| input_buf_ptr = cstate->input_buf_index; |
| copy_buf_len = cstate->input_buf_len; |
| |
| /* |
| * If we are completely out of data, break out of the loop, |
| * reporting EOF. |
| */ |
| if (INPUT_BUF_BYTES(cstate) <= 0) |
| { |
| result = true; |
| break; |
| } |
| need_data = false; |
| } |
| |
| /* OK to fetch a character */ |
| prev_raw_ptr = input_buf_ptr; |
| c = copy_input_buf[input_buf_ptr++]; |
| |
| if (cstate->opts.csv_mode) |
| { |
| /* |
| * If character is '\\' or '\r', we may need to look ahead below. |
| * Force fetch of the next character if we don't already have it. |
| * We need to do this before changing CSV state, in case one of |
| * these characters is also the quote or escape character. |
| * |
| * Note: old-protocol does not like forced prefetch, but it's OK |
| * here since we cannot validly be at EOF. |
| */ |
| if (c == '\\' || c == '\r') |
| { |
| IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); |
| } |
| |
| /* |
| * Dealing with quotes and escapes here is mildly tricky. If the |
| * quote char is also the escape char, there's no problem - we |
| * just use the char as a toggle. If they are different, we need |
| * to ensure that we only take account of an escape inside a |
| * quoted field and immediately preceding a quote char, and not |
| * the second in an escape-escape sequence. |
| */ |
| if (in_quote && c == escapec) |
| last_was_esc = !last_was_esc; |
| if (c == quotec && !last_was_esc) |
| in_quote = !in_quote; |
| if (c != escapec) |
| last_was_esc = false; |
| |
| /* |
| * Updating the line count for embedded CR and/or LF chars is |
| * necessarily a little fragile - this test is probably about the |
| * best we can do. (XXX it's arguable whether we should do this |
| * at all --- is cur_lineno a physical or logical count?) |
| */ |
| if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r')) |
| cstate->cur_lineno++; |
| } |
| |
| /* Process \r */ |
| if (c == '\r' && (!cstate->opts.csv_mode || !in_quote)) |
| { |
| /* Check for \r\n on first line, _and_ handle \r\n. */ |
| if (cstate->eol_type == EOL_UNKNOWN || |
| cstate->eol_type == EOL_CRNL) |
| { |
| /* |
| * If need more data, go back to loop top to load it. |
| * |
| * Note that if we are at EOF, c will wind up as '\0' because |
| * of the guaranteed pad of raw_buf. |
| */ |
| IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); |
| |
| /* get next char */ |
| c = copy_input_buf[input_buf_ptr]; |
| |
| if (c == '\n') |
| { |
| input_buf_ptr++; /* eat newline */ |
| cstate->eol_type = EOL_CRNL; /* in case not set yet */ |
| |
| /* |
| * GPDB: end of line. Since we don't error out if we find a |
| * bare CR or LF in CRLF mode, break here instead. |
| */ |
| break; |
| } |
| else |
| { |
| /* |
| * GPDB_91_MERGE_FIXME: these commented-out blocks (as well |
| * as the restructured newline checks) are here because we |
| * allow the user to manually set the newline mode, and |
| * therefore don't error out on bare CR/LF in the middle of |
| * a column. Instead, they will be included verbatim. |
| * |
| * This probably has other fallout -- but so does changing |
| * the behavior. Discuss. |
| */ |
| #if 0 |
| /* found \r, but no \n */ |
| if (cstate->eol_type == EOL_CRNL) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| !cstate->opts.csv_mode ? |
| errmsg("literal carriage return found in data") : |
| errmsg("unquoted carriage return found in data"), |
| !cstate->opts.csv_mode ? |
| errhint("Use \"\\r\" to represent carriage return.") : |
| errhint("Use quoted CSV field to represent carriage return."))); |
| #endif |
| |
| /* GPDB: only reset eol_type if it's currently unknown. */ |
| if (cstate->eol_type == EOL_UNKNOWN && (cstate->opts.eol_type == EOL_UNKNOWN || cstate->opts.eol_type == EOL_CR)) |
| { |
| /* |
| * if we got here, it is the first line and we didn't find |
| * \n, so don't consume the peeked character |
| */ |
| cstate->eol_type = EOL_CR; |
| } |
| } |
| } |
| #if 0 /* GPDB_91_MERGE_FIXME: see above. */ |
| else if (cstate->eol_type == EOL_NL) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| !cstate->opts.csv_mode ? |
| errmsg("literal carriage return found in data") : |
| errmsg("unquoted carriage return found in data"), |
| !cstate->opts.csv_mode ? |
| errhint("Use \"\\r\" to represent carriage return.") : |
| errhint("Use quoted CSV field to represent carriage return."))); |
| #endif |
| /* GPDB: a CR only ends the line in CR mode. */ |
| if (cstate->eol_type == EOL_CR) |
| { |
| /* If reach here, we have found the line terminator */ |
| break; |
| } |
| } |
| |
| /* Process \n */ |
| if (c == '\n' && (!cstate->opts.csv_mode || !in_quote)) |
| { |
| #if 0 /* GPDB_91_MERGE_FIXME: see above. */ |
| if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| !cstate->opts.csv_mode ? |
| errmsg("literal newline found in data") : |
| errmsg("unquoted newline found in data"), |
| !cstate->opts.csv_mode ? |
| errhint("Use \"\\n\" to represent newline.") : |
| errhint("Use quoted CSV field to represent newline."))); |
| #endif |
| /* GPDB: only reset eol_type if it's currently unknown. */ |
| if (cstate->eol_type == EOL_UNKNOWN && (cstate->opts.eol_type == EOL_UNKNOWN || cstate->opts.eol_type == EOL_NL)) |
| cstate->eol_type = EOL_NL; /* in case not set yet */ |
| |
| /* GPDB: a LF only ends the line in LF mode. */ |
| if (cstate->eol_type == EOL_NL) |
| { |
| /* If reach here, we have found the line terminator */ |
| break; |
| } |
| } |
| |
| /* |
| * In CSV mode, we only recognize \. alone on a line. This is because |
| * \. is a valid CSV data value. |
| */ |
| if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line)) |
| { |
| char c2; |
| |
| IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); |
| IF_NEED_REFILL_AND_EOF_BREAK(0); |
| |
| /* ----- |
| * get next character |
| * Note: we do not change c so if it isn't \., we can fall |
| * through and continue processing for file encoding. |
| * ----- |
| */ |
| c2 = copy_input_buf[input_buf_ptr]; |
| |
| /* |
| * We need to recognize the EOL. |
| * Github issue: https://github.com/greenplum-db/gpdb/issues/12454 |
| */ |
| if(c2 == '\n') |
| { |
| if(cstate->eol_type == EOL_UNKNOWN) |
| { |
| /* We still do not found the first EOL. |
| * The current '\n' will be recongnized as EOL |
| * in next loop of c1. |
| */ |
| goto not_end_of_copy; |
| } |
| else if(cstate->eol_type == EOL_NL) |
| { |
| // found a new line with '\n' |
| input_buf_ptr++; |
| break; |
| } |
| } |
| if (c2 == '\r') |
| { |
| if(cstate->eol_type == EOL_UNKNOWN) |
| { |
| goto not_end_of_copy; |
| } |
| else if(cstate->eol_type == EOL_CR) |
| { |
| // found a new line wirh '\r' |
| input_buf_ptr++; |
| break; |
| } |
| else if(cstate->eol_type == EOL_CRNL) |
| { |
| /* |
| * Because the eol is '\r\n', we need another character c3 |
| * which comes after c2 if exists. |
| */ |
| char c3; |
| input_buf_ptr++; |
| IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); |
| IF_NEED_REFILL_AND_EOF_BREAK(0); |
| c3 = copy_input_buf[input_buf_ptr]; |
| if(c3 == '\n') |
| { |
| // found a new line with '\r\n' |
| input_buf_ptr++; |
| break; |
| } else { |
| NO_END_OF_COPY_GOTO; |
| } |
| } |
| } |
| if (c2 == '.') |
| { |
| input_buf_ptr++; /* consume the '.' */ |
| |
| /* |
| * Note: if we loop back for more data here, it does not |
| * matter that the CSV state change checks are re-executed; we |
| * will come back here with no important state changed. |
| */ |
| if (cstate->eol_type == EOL_CRNL) |
| { |
| /* Get the next character */ |
| IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); |
| /* if hit_eof, c2 will become '\0' */ |
| c2 = copy_input_buf[input_buf_ptr++]; |
| |
| if (c2 == '\n') |
| { |
| if (!cstate->opts.csv_mode) |
| { |
| cstate->input_buf_index = input_buf_ptr; |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("end-of-copy marker does not match previous newline style"))); |
| } |
| else |
| NO_END_OF_COPY_GOTO; |
| } |
| else if (c2 != '\r') |
| { |
| if (!cstate->opts.csv_mode) |
| { |
| cstate->input_buf_index = input_buf_ptr; |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("end-of-copy marker corrupt"))); |
| } |
| else |
| NO_END_OF_COPY_GOTO; |
| } |
| } |
| |
| /* Get the next character */ |
| IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); |
| /* if hit_eof, c2 will become '\0' */ |
| c2 = copy_input_buf[input_buf_ptr++]; |
| |
| if (c2 != '\r' && c2 != '\n') |
| { |
| if (!cstate->opts.csv_mode) |
| { |
| cstate->input_buf_index = input_buf_ptr; |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("end-of-copy marker corrupt"))); |
| } |
| else |
| NO_END_OF_COPY_GOTO; |
| } |
| |
| if ((cstate->eol_type == EOL_NL && c2 != '\n') || |
| (cstate->eol_type == EOL_CRNL && c2 != '\n') || |
| (cstate->eol_type == EOL_CR && c2 != '\r')) |
| { |
| cstate->input_buf_index = input_buf_ptr; |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("end-of-copy marker does not match previous newline style"))); |
| } |
| |
| /* |
| * Transfer only the data before the \. into line_buf, then |
| * discard the data and the \. sequence. |
| */ |
| if (prev_raw_ptr > cstate->input_buf_index) |
| appendBinaryStringInfo(&cstate->line_buf, |
| cstate->raw_buf + cstate->input_buf_index, |
| prev_raw_ptr - cstate->input_buf_index); |
| cstate->input_buf_index = input_buf_ptr; |
| result = true; /* report EOF */ |
| break; |
| } |
| else if (!cstate->opts.csv_mode) |
| { |
| /* |
| * If we are here, it means we found a backslash followed by |
| * something other than a period. In non-CSV mode, anything |
| * after a backslash is special, so we skip over that second |
| * character too. If we didn't do that \\. would be |
| * considered an eof-of copy, while in non-CSV mode it is a |
| * literal backslash followed by a period. In CSV mode, |
| * backslashes are not special, so we want to process the |
| * character after the backslash just like a normal character, |
| * so we don't increment in those cases. |
| */ |
| input_buf_ptr++; |
| } |
| } |
| /* |
| * This label is for CSV cases where \. appears at the start of a |
| * line, but there is more text after it, meaning it was a data value. |
| * We are more strict for \. in CSV mode because \. could be a data |
| * value, while in non-CSV mode, \. cannot be a data value. |
| */ |
| not_end_of_copy: |
| first_char_in_line = false; |
| } /* end of outer loop */ |
| |
| /* |
| * Transfer any still-uncopied data to line_buf. |
| */ |
| REFILL_LINEBUF; |
| |
| return result; |
| } |
| |
| /* |
| * Return decimal value for a hexadecimal digit |
| */ |
| static int |
| GetDecimalFromHex(char hex) |
| { |
| if (isdigit((unsigned char) hex)) |
| return hex - '0'; |
| else |
| return tolower((unsigned char) hex) - 'a' + 10; |
| } |
| |
| /* |
| * Parse the current line into separate attributes (fields), |
| * performing de-escaping as needed. |
| * |
| * The input is in line_buf. We use attribute_buf to hold the result |
| * strings. cstate->raw_fields[k] is set to point to the k'th attribute |
| * string, or NULL when the input matches the null marker string. |
| * This array is expanded as necessary. |
| * |
| * (Note that the caller cannot check for nulls since the returned |
| * string would be the post-de-escaping equivalent, which may look |
| * the same as some valid data string.) |
| * |
| * delim is the column delimiter string (must be just one byte for now). |
| * null_print is the null marker string. Note that this is compared to |
| * the pre-de-escaped input string. |
| * |
| * The return value is the number of fields actually read. |
| */ |
| int |
| CopyReadAttributesText(CopyFromState cstate, int stop_processing_at_field) |
| { |
| char delimc = cstate->opts.delim[0]; |
| char escapec = cstate->opts.escape[0]; |
| bool delim_off = cstate->opts.delim_off; |
| int fieldno; |
| char *output_ptr; |
| char *cur_ptr; |
| char *line_end_ptr; |
| |
| /* |
| * We need a special case for zero-column tables: check that the input |
| * line is empty, and return. |
| */ |
| if (cstate->max_fields <= 0) |
| { |
| if (cstate->line_buf.len != 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("extra data after last expected column"))); |
| return 0; |
| } |
| |
| resetStringInfo(&cstate->attribute_buf); |
| |
| /* |
| * The de-escaped attributes will certainly not be longer than the input |
| * data line, so we can just force attribute_buf to be large enough and |
| * then transfer data without any checks for enough space. We need to do |
| * it this way because enlarging attribute_buf mid-stream would invalidate |
| * pointers already stored into cstate->raw_fields[]. |
| */ |
| if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) |
| enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); |
| output_ptr = cstate->attribute_buf.data; |
| |
| /* set pointer variables for loop */ |
| cur_ptr = cstate->line_buf.data + cstate->line_buf.cursor; |
| line_end_ptr = cstate->line_buf.data + cstate->line_buf.len; |
| |
| /* Outer loop iterates over fields */ |
| fieldno = 0; |
| for (;;) |
| { |
| bool found_delim = false; |
| char *start_ptr; |
| char *end_ptr; |
| int input_len; |
| bool saw_non_ascii = false; |
| |
| /* |
| * In QD, stop once we have processed the last field we need in the QD. |
| */ |
| if (fieldno == stop_processing_at_field) |
| { |
| cstate->stopped_processing_at_delim = true; |
| break; |
| } |
| |
| /* Make sure there is enough space for the next value */ |
| if (fieldno >= cstate->max_fields) |
| { |
| cstate->max_fields *= 2; |
| cstate->raw_fields = |
| repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *)); |
| } |
| |
| /* Remember start of field on both input and output sides */ |
| start_ptr = cur_ptr; |
| cstate->raw_fields[fieldno] = output_ptr; |
| |
| /* |
| * Scan data for field. |
| * |
| * Note that in this loop, we are scanning to locate the end of field |
| * and also speculatively performing de-escaping. Once we find the |
| * end-of-field, we can match the raw field contents against the null |
| * marker string. Only after that comparison fails do we know that |
| * de-escaping is actually the right thing to do; therefore we *must |
| * not* throw any syntax errors before we've done the null-marker |
| * check. |
| */ |
| for (;;) |
| { |
| char c; |
| |
| end_ptr = cur_ptr; |
| if (cur_ptr >= line_end_ptr) |
| break; |
| c = *cur_ptr++; |
| if (c == delimc && !delim_off) |
| { |
| found_delim = true; |
| break; |
| } |
| if (c == escapec && !cstate->escape_off) |
| { |
| if (cur_ptr >= line_end_ptr) |
| break; |
| c = *cur_ptr++; |
| switch (c) |
| { |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| { |
| /* handle \013 */ |
| int val; |
| |
| val = OCTVALUE(c); |
| if (cur_ptr < line_end_ptr) |
| { |
| c = *cur_ptr; |
| if (ISOCTAL(c)) |
| { |
| cur_ptr++; |
| val = (val << 3) + OCTVALUE(c); |
| if (cur_ptr < line_end_ptr) |
| { |
| c = *cur_ptr; |
| if (ISOCTAL(c)) |
| { |
| cur_ptr++; |
| val = (val << 3) + OCTVALUE(c); |
| } |
| } |
| } |
| } |
| c = val & 0377; |
| if (c == '\0' || IS_HIGHBIT_SET(c)) |
| saw_non_ascii = true; |
| } |
| break; |
| case 'x': |
| /* Handle \x3F */ |
| if (cur_ptr < line_end_ptr) |
| { |
| char hexchar = *cur_ptr; |
| |
| if (isxdigit((unsigned char) hexchar)) |
| { |
| int val = GetDecimalFromHex(hexchar); |
| |
| cur_ptr++; |
| if (cur_ptr < line_end_ptr) |
| { |
| hexchar = *cur_ptr; |
| if (isxdigit((unsigned char) hexchar)) |
| { |
| cur_ptr++; |
| val = (val << 4) + GetDecimalFromHex(hexchar); |
| } |
| } |
| c = val & 0xff; |
| if (c == '\0' || IS_HIGHBIT_SET(c)) |
| saw_non_ascii = true; |
| } |
| } |
| break; |
| case 'b': |
| c = '\b'; |
| break; |
| case 'f': |
| c = '\f'; |
| break; |
| case 'n': |
| c = '\n'; |
| break; |
| case 'r': |
| c = '\r'; |
| break; |
| case 't': |
| c = '\t'; |
| break; |
| case 'v': |
| c = '\v'; |
| break; |
| |
| /* |
| * in all other cases, take the char after '\' |
| * literally |
| */ |
| } |
| } |
| |
| /* Add c to output string */ |
| *output_ptr++ = c; |
| } |
| |
| /* Check whether raw input matched null marker */ |
| input_len = end_ptr - start_ptr; |
| if (input_len == cstate->opts.null_print_len && |
| strncmp(start_ptr, cstate->opts.null_print, input_len) == 0) |
| cstate->raw_fields[fieldno] = NULL; |
| else |
| { |
| /* |
| * At this point we know the field is supposed to contain data. |
| * |
| * If we de-escaped any non-7-bit-ASCII chars, make sure the |
| * resulting string is valid data for the db encoding. |
| */ |
| if (saw_non_ascii) |
| { |
| char *fld = cstate->raw_fields[fieldno]; |
| |
| pg_verifymbstr(fld, output_ptr - fld, false); |
| } |
| } |
| |
| /* Terminate attribute value in output area */ |
| *output_ptr++ = '\0'; |
| |
| fieldno++; |
| /* Done if we hit EOL instead of a delim */ |
| if (!found_delim) |
| { |
| cstate->stopped_processing_at_delim = false; |
| break; |
| } |
| } |
| |
| /* |
| * Make note of the stopping point in 'line_buf.cursor', so that we |
| * can send the rest to the QE later. |
| */ |
| cstate->line_buf.cursor = cur_ptr - cstate->line_buf.data; |
| |
| /* Clean up state of attribute_buf */ |
| output_ptr--; |
| Assert(*output_ptr == '\0'); |
| cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data); |
| |
| return fieldno; |
| } |
| |
| /* |
| * Parse the current line into separate attributes (fields), |
| * performing de-escaping as needed. This has exactly the same API as |
| * CopyReadAttributesText, except we parse the fields according to |
| * "standard" (i.e. common) CSV usage. |
| */ |
| int |
| CopyReadAttributesCSV(CopyFromState cstate, int stop_processing_at_field) |
| { |
| char delimc = cstate->opts.delim[0]; |
| bool delim_off = cstate->opts.delim_off; |
| char quotec = cstate->opts.quote[0]; |
| char escapec = cstate->opts.escape[0]; |
| int fieldno; |
| char *output_ptr; |
| char *cur_ptr; |
| char *line_end_ptr; |
| |
| /* |
| * We need a special case for zero-column tables: check that the input |
| * line is empty, and return. |
| */ |
| if (cstate->max_fields <= 0) |
| { |
| if (cstate->line_buf.len != 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("extra data after last expected column"))); |
| return 0; |
| } |
| |
| resetStringInfo(&cstate->attribute_buf); |
| |
| /* |
| * The de-escaped attributes will certainly not be longer than the input |
| * data line, so we can just force attribute_buf to be large enough and |
| * then transfer data without any checks for enough space. We need to do |
| * it this way because enlarging attribute_buf mid-stream would invalidate |
| * pointers already stored into cstate->raw_fields[]. |
| */ |
| if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) |
| enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); |
| output_ptr = cstate->attribute_buf.data; |
| |
| /* set pointer variables for loop */ |
| cur_ptr = cstate->line_buf.data + cstate->line_buf.cursor; |
| line_end_ptr = cstate->line_buf.data + cstate->line_buf.len; |
| |
| /* Outer loop iterates over fields */ |
| fieldno = 0; |
| for (;;) |
| { |
| bool found_delim = false; |
| bool saw_quote = false; |
| char *start_ptr; |
| char *end_ptr; |
| int input_len; |
| |
| /* |
| * In QD, stop once we have processed the last field we need in the QD. |
| */ |
| if (fieldno == stop_processing_at_field) |
| { |
| cstate->stopped_processing_at_delim = true; |
| break; |
| } |
| |
| /* Make sure there is enough space for the next value */ |
| if (fieldno >= cstate->max_fields) |
| { |
| cstate->max_fields *= 2; |
| cstate->raw_fields = |
| repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *)); |
| } |
| |
| /* Remember start of field on both input and output sides */ |
| start_ptr = cur_ptr; |
| cstate->raw_fields[fieldno] = output_ptr; |
| |
| /* |
| * Scan data for field, |
| * |
| * The loop starts in "not quote" mode and then toggles between that |
| * and "in quote" mode. The loop exits normally if it is in "not |
| * quote" mode and a delimiter or line end is seen. |
| */ |
| for (;;) |
| { |
| char c; |
| |
| /* Not in quote */ |
| for (;;) |
| { |
| end_ptr = cur_ptr; |
| if (cur_ptr >= line_end_ptr) |
| goto endfield; |
| c = *cur_ptr++; |
| /* unquoted field delimiter */ |
| if (c == delimc && !delim_off) |
| { |
| found_delim = true; |
| goto endfield; |
| } |
| /* start of quoted field (or part of field) */ |
| if (c == quotec) |
| { |
| saw_quote = true; |
| break; |
| } |
| /* Add c to output string */ |
| *output_ptr++ = c; |
| } |
| |
| /* In quote */ |
| for (;;) |
| { |
| end_ptr = cur_ptr; |
| if (cur_ptr >= line_end_ptr) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("unterminated CSV quoted field"))); |
| |
| c = *cur_ptr++; |
| |
| /* escape within a quoted field */ |
| if (c == escapec) |
| { |
| /* |
| * peek at the next char if available, and escape it if it |
| * is an escape char or a quote char |
| */ |
| if (cur_ptr < line_end_ptr) |
| { |
| char nextc = *cur_ptr; |
| |
| if (nextc == escapec || nextc == quotec) |
| { |
| *output_ptr++ = nextc; |
| cur_ptr++; |
| continue; |
| } |
| } |
| } |
| |
| /* |
| * end of quoted field. Must do this test after testing for |
| * escape in case quote char and escape char are the same |
| * (which is the common case). |
| */ |
| if (c == quotec) |
| break; |
| |
| /* Add c to output string */ |
| *output_ptr++ = c; |
| } |
| } |
| endfield: |
| |
| /* Terminate attribute value in output area */ |
| *output_ptr++ = '\0'; |
| |
| /* Check whether raw input matched null marker */ |
| input_len = end_ptr - start_ptr; |
| if (!saw_quote && input_len == cstate->opts.null_print_len && |
| strncmp(start_ptr, cstate->opts.null_print, input_len) == 0) |
| cstate->raw_fields[fieldno] = NULL; |
| |
| fieldno++; |
| /* Done if we hit EOL instead of a delim */ |
| if (!found_delim) |
| { |
| cstate->stopped_processing_at_delim = false; |
| break; |
| } |
| } |
| |
| /* |
| * Make note of the stopping point in 'line_buf.cursor', so that we |
| * can send the rest to the QE later. |
| */ |
| cstate->line_buf.cursor = cur_ptr - cstate->line_buf.data; |
| |
| /* Clean up state of attribute_buf */ |
| output_ptr--; |
| Assert(*output_ptr == '\0'); |
| cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data); |
| |
| return fieldno; |
| } |
| |
| /* |
| * Read a binary attribute |
| */ |
| Datum |
| CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, |
| Oid typioparam, int32 typmod, |
| bool *isnull) |
| { |
| int32 fld_size; |
| Datum result; |
| |
| if (!CopyGetInt32(cstate, &fld_size)) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("unexpected EOF in COPY data"))); |
| if (fld_size == -1) |
| { |
| *isnull = true; |
| return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod); |
| } |
| if (fld_size < 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("invalid field size"))); |
| |
| /* reset attribute_buf to empty, and load raw data in it */ |
| resetStringInfo(&cstate->attribute_buf); |
| |
| enlargeStringInfo(&cstate->attribute_buf, fld_size); |
| if (CopyReadBinaryData(cstate, cstate->attribute_buf.data, fld_size) != fld_size) |
| ereport(ERROR, |
| (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
| errmsg("unexpected EOF in COPY data"))); |
| |
| cstate->attribute_buf.len = fld_size; |
| cstate->attribute_buf.data[fld_size] = '\0'; |
| |
| /* Call the column type's binary input converter */ |
| result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf, |
| typioparam, typmod); |
| |
| /* Trouble if it didn't eat the whole buffer */ |
| if (cstate->attribute_buf.cursor != cstate->attribute_buf.len) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), |
| errmsg("incorrect binary data format"))); |
| |
| *isnull = false; |
| return result; |
| } |
| |
| void |
| RemoveInvalidDataInBuf(CopyFromState cstate) |
| { |
| int scanidx; |
| |
| if (cstate->errMode == ALL_OR_NOTHING) |
| CopyConversionError(cstate); |
| |
| /* reach EOF, do not need clean */ |
| if (cstate->input_reached_eof) |
| return; |
| |
| if (!cstate->need_transcoding) |
| { |
| int nbytes; |
| |
| /* |
| * According to `BeginCopyFrom`, if not need_transcoding these two |
| * pointer share one memory space. |
| */ |
| Assert(cstate->raw_buf == cstate->input_buf); |
| |
| if (FindEolInUnverifyRawBuf(cstate, &scanidx)) |
| { |
| cstate->raw_buf_index += scanidx; |
| nbytes = RAW_BUF_BYTES(cstate); |
| if (nbytes > 0 && cstate->raw_buf_index > 0) |
| memmove(cstate->raw_buf, |
| cstate->raw_buf + cstate->raw_buf_index, nbytes); |
| |
| cstate->raw_buf_len -= cstate->raw_buf_index; |
| cstate->raw_buf_index = 0; |
| } |
| else |
| { |
| /* Current page can not find eol, to skip current raw buffer */ |
| cstate->raw_buf_len = 0; |
| cstate->raw_buf_index = 0; |
| |
| /* leave a hint to identify find eol after next raw page read */ |
| cstate->find_eol_with_rawreading = true; |
| } |
| } |
| else |
| { |
| /* |
| * Transcoding case: raw_buf and input_buf are separate buffers. |
| * Skip the bad line in raw_buf by finding the next EOL. No need to |
| * memmove raw_buf here; CopyLoadRawBuf() will compact it when more |
| * raw data is needed. |
| */ |
| if (FindEolInUnverifyRawBuf(cstate, &scanidx)) |
| { |
| cstate->raw_buf_index += scanidx; |
| } |
| else |
| { |
| /* Current page can not find eol, to skip current raw buffer */ |
| cstate->raw_buf_len = 0; |
| cstate->raw_buf_index = 0; |
| |
| /* leave a hint to identify find eol after next raw page read */ |
| cstate->find_eol_with_rawreading = true; |
| } |
| } |
| |
| /* reset input buf, so we can redo conversion/verification */ |
| cstate->input_reached_error = false; |
| cstate->input_buf_index = 0; |
| cstate->input_buf_len = 0; |
| |
| /* reset line_buf */ |
| resetStringInfo(&cstate->line_buf); |
| cstate->line_buf_valid = false; |
| } |
| |
| static bool |
| FindEolInUnverifyRawBuf(const CopyFromState cstate, int *scanidx) |
| { |
| bool found = false; |
| EolType eol_type = cstate->opts.eol_type; |
| |
| for (*scanidx = 0; *scanidx < RAW_BUF_BYTES(cstate); (*scanidx)++) |
| { |
| if ('\r' == cstate->raw_buf[cstate->raw_buf_index + *scanidx]) |
| { |
| if (eol_type == EOL_CR || eol_type == EOL_UNKNOWN) |
| { |
| *scanidx += 1; |
| found = true; |
| break; |
| } |
| |
| if (eol_type == EOL_CRNL || eol_type == EOL_UNKNOWN) |
| { |
| if (*scanidx + 1 < RAW_BUF_BYTES(cstate) && |
| '\n' == cstate->raw_buf[cstate->raw_buf_index + *scanidx + 1]) |
| { |
| *scanidx += 2; |
| found = true; |
| break; |
| } |
| } |
| } |
| else if ('\n' == cstate->raw_buf[cstate->raw_buf_index + *scanidx]) |
| { |
| if (eol_type == EOL_NL || eol_type == EOL_UNKNOWN) |
| { |
| *scanidx += 1; |
| found = true; |
| break; |
| } |
| } |
| } |
| |
| return found; |
| } |