src/include/commands/copy.h - hawq - Git at Google

 /*-------------------------------------------------------------------------
  *
  * copy.h
  *	  Definitions for using the POSTGRES copy command.
  *
  *
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * $PostgreSQL: pgsql/src/include/commands/copy.h,v 1.28 2006/08/30 23:34:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef COPY_H
 #define COPY_H

 #include "c.h"
 #include "nodes/parsenodes.h"
 #include "tcop/dest.h"
 #include "executor/executor.h"


 /*
  * Represents the different source/dest cases we need to worry about at
  * the bottom level
  */
 typedef enum CopyDest
 {
 	COPY_FILE,					/* to/from file */
 	COPY_OLD_FE,				/* to/from frontend (2.0 protocol) */
 	COPY_NEW_FE,				/* to/from frontend (3.0 protocol) */
 	COPY_EXTERNAL_SOURCE		/* to/from external source (RET/WET) */
 } CopyDest;

 /*
  *	Represents the end-of-line terminator type of the input
  */
 typedef enum EolType
 {
 	EOL_UNKNOWN,
 	EOL_LF,
 	EOL_CR,
 	EOL_CRLF
 } EolType;

 /*
  * There are several ways to know the input row number where an error happened,
  * in order to report it reliably.
  *
  * ROWNUM_ORIGINAL - Used when the error happens in the same place that the
  * *whole* file or data is read, and input row numbers can be tracked reliably.
  * So, if an error happens in the n'th data row read, it's actually the n'th row
  * of the file (or data stream).
  * Using this type are - COPY in dispatch mode, COPY in utility mode, External
  * tables with 'file', and external web tables.
  *
  * ROWNUM_EMBEDDED - Used when the error happens in a place where a random
  * part of the file is read (given to us) and therefore the row numbers of the
  * original data are unknown.
  * So, If an error happens in the n'th row read it can actually be the (n+m)'th
  * row of the original file.
  * Using this type are - COPY in execute mode, and External tables getting data
  * from a gpfdist process in a CSV Format (gpfdist only parses CSV row by row,
  * while text format is parsed in chucks).
  * In this case we do some extra work to retrieve the original row number - the
  * distributor (COPY dispatcher, or gpfdist) embeds the original row number in
  * the beginning of each data row, and this number is extracted later on.
  *
  * BYTENUM_EMBEDDED - Original row isn't even known to the distributor, only
  * the byte offset of each chunk it sends. We report errors in byte offset
  * number, not row number. We keep track of byte counts. This is currently
  * only used by external tables with gpfdist in 'text' format.
  *
  */
 typedef enum ErrLocType
 {
 	ROWNUM_ORIGINAL,
 	ROWNUM_EMBEDDED,
 	BYTENUM_EMBEDDED
 } ErrLocType;


 /*
  * The error handling mode for this data load.
  */
 typedef enum CopyErrMode
 {
 	ALL_OR_NOTHING,	/* Either all rows or no rows get loaded (the default) */
 	SREH_IGNORE,	/* Sreh - ignore errors (REJECT but no error table) */
 	SREH_LOG		/* Sreh - log errors in an error table */
 } CopyErrMode;


 /*
  * This struct contains all the state variables used throughout a COPY
  * operation. For simplicity, we use the same struct for all variants of COPY,
  * even though some fields are used in only some cases.
  *
  * Multi-byte encodings: all supported client-side encodings encode multi-byte
  * characters by having the first byte's high bit set. Subsequent bytes of the
  * character can have the high bit not set. When scanning data in such an
  * encoding to look for a match to a single-byte (ie ASCII) character, we must
  * use the full pg_encoding_mblen() machinery to skip over multibyte
  * characters, else we might find a false match to a trailing byte. In
  * supported server encodings, there is no possibility of a false match, and
  * it's faster to make useless comparisons to trailing bytes than it is to
  * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is TRUE
  * when we have to do it the hard way.
  */
 typedef struct CopyStateData
 {
 	/* low-level state data */
 	CopyDest	copy_dest;		/* type of copy source/destination */
 	FILE	   *copy_file;		/* used if copy_dest == COPY_FILE */
 	StringInfo	fe_msgbuf;		/* used for all dests during COPY TO, only for
 								 * dest == COPY_NEW_FE in COPY FROM */
 	bool		fe_copy;		/* true for all FE copy dests */
 	bool		fe_eof;			/* true if detected end of copy data */
 	EolType		eol_type;		/* EOL type of input */
 	char	   *eol_str;		/* optional NEWLINE from command. before eol_type is defined */
 	int			client_encoding;	/* remote side's character encoding */
 	bool		need_transcoding;		/* client encoding diff from server? */
 	bool		encoding_embeds_ascii;	/* ASCII can be non-first byte? */
 	FmgrInfo   *enc_conversion_proc; /* conv proc from exttbl encoding to
 										server or the other way around */
 	FmgrInfo   *custom_formatter_func; /* function to convert to custom format */
 	char	   *custom_formatter_name; /* name of function to convert to custom format */
 	List	   *custom_formatter_params; /* list of defelems that hold user's format parameters */
 	uint64		processed;		/* # of tuples processed */
 	size_t		bytesread;

 	/* parameters from the COPY command */
 	Relation	rel;			/* relation to copy to or from */
 	QueryDesc  *queryDesc;		/* executable query to copy from */
 	List	   *attnumlist;		/* integer list of attnums to copy */
 	List	   *attnamelist;	/* list of attributes by name */
 	List	   *force_quote;	/* the raw fc column name list */
 	List	   *force_notnull;  /* the raw fnn column name list */
 	char	   *filename;		/* filename, or NULL for STDIN/STDOUT */
 	bool		custom;			/* custom format? */
 	bool		oids;			/* include OIDs? */
 	bool		csv_mode;		/* Comma Separated Value format? */
 	bool		header_line;	/* CSV header line? */
 	char	   *null_print;		/* NULL marker string (server encoding!) */
 	int			null_print_len; /* length of same */
 	char	   *null_print_client;		/* same converted to client encoding */
 	char	   *delim;			/* column delimiter (1 byte or multibytes) */
 	char	   *quote;			/* CSV quote char (must be 1 byte) */
 	char	   *escape;			/* CSV escape char (must be 1 byte) */
 	bool	   *force_quote_flags;		/* per-column CSV FQ flags */
 	bool	   *force_notnull_flags;	/* per-column CSV FNN flags */
 	bool		fill_missing;	/* missing attrs at end of line are NULL */

 	/* these are just for error messages, see copy_in_error_callback */
 	const char *cur_relname;	/* table name for error messages */
 	int64		cur_lineno;		/* line number for error messages.  Negative means it isn't available. */
 	int64       cur_byteno;     /* number of bytes processed from input */
 	const char *cur_attname;	/* current att for error messages */
 	//const char *cur_attval;		 /* current att value for error messages */

 	/*
 	 * Working state for COPY TO
 	 */
 	FmgrInfo   *out_functions;	/* lookup info for output functions */
 	MemoryContext rowcontext;	/* per-row evaluation context */

 	/*
 	 * These variables are used to reduce overhead in textual COPY FROM.
 	 *
 	 * attribute_buf holds the separated, de-escaped text for each field of
 	 * the current line.  The CopyReadAttributes functions return arrays of
 	 * pointers into this buffer.  We avoid palloc/pfree overhead by re-using
 	 * the buffer on each cycle.
 	 */
 	StringInfoData attribute_buf;

 	/*
 	 * Similarly, line_buf holds the whole input line being processed. The
 	 * input cycle is first to read the whole line into line_buf, convert it
 	 * to server encoding there, and then extract the individual attribute
 	 * fields into attribute_buf.  line_buf is preserved unmodified so that we
 	 * can display it in error messages if appropriate.
 	 */
 	StringInfoData line_buf;

 	int		   *attr_offsets;

 	bool		line_buf_converted;		/* converted to server encoding? */

 	/*
 	 * Finally, raw_buf holds raw data read from the data source (file or
 	 * client connection). CopyReadLine parses this data sufficiently to
 	 * locate line boundaries, then transfers the data to line_buf and
 	 * converts it.  Note: we guarantee that there is a \0 at
 	 * raw_buf[raw_buf_len].
 	 */

 #define RAW_BUF_SIZE 65536

 	/* NOTE: raw_buf in Greenplum Database is used with different logic than postgres COPY */
 	char		raw_buf[RAW_BUF_SIZE + 1];		/* extra byte for '\0' */
 	int			raw_buf_index;	/* next byte to process */
 	bool		raw_buf_done;	/* finished processing the current buffer */
 	int			missing_bytes;  /* see scanTextLine() for explanation */
 	/* Greenplum Database specific variables */
 	bool		is_copy_in;		/* copy in or out? */
 	char		eol_ch[2];		/* The byte values of the 1 or 2 eol bytes */
 	bool		escape_off;		/* treat backslashes as non-special? */
 	bool		delimiter_off;  /* no delimiter. 1-column external tabs only */
 	bool		delimiter_multibytes;
 	int			last_hash_field;
 	bool		line_done;		/* finished processing the whole line or
 								 * stopped in the middle */
 	bool		end_marker;
 	char	   *begloc;
 	char	   *endloc;
 	bool		error_on_executor;		/* true if errors arrived from the
 										 * executors COPY (QEs) */
 	StringInfoData executor_err_context;		/* error context text from QE */


 	/* for CSV format */
 	bool		in_quote;
 	bool		last_was_esc;

 	/* for TEXT format */
 	bool		esc_in_prevbuf; /* escape was last character of the data input
 								 * buffer */
 	bool		cr_in_prevbuf;	/* CR was last character of the data input
 								 * buffer */

 	/* Original row number tracking variables */
 #define COPY_METADATA_DELIM '^'
 	ErrLocType  err_loc_type;   /* see enum def for description */
 	bool		md_error;

 	/* Error handling options */
 	CopyErrMode	errMode;
 	struct CdbSreh		*cdbsreh; /* single row error handler */
 	int			num_consec_csv_err; /* # of consecutive csv invalid format errs */

 	PartitionNode *partitions; /* partitioning meta data from dispatcher */
 	List		  *ao_segnos;  /* AO table meta data from dispatcher */
 	List *ao_segfileinfos; /* AO segment file information from dispatcher */
 	List *splits;				/* table scan splits for this segment */
 	/* end Greenplum Database specific variables */
 	struct QueryResource *resource;
 	PlannedStmt *planstmt;  /* ext table copy to */
 } CopyStateData;

 typedef CopyStateData *CopyState;


 #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
 #define OCTVALUE(c) ((c) - '0')

 extern void ValidateControlChars(bool copy, bool load, bool csv_mode, char *delim,
 								 char *null_print, char *quote, char *escape,
 								 List *force_quote, List *force_notnull,
 								 bool header_line, bool fill_missing, char *newline,
 								 int numcols);
 extern uint64 DoCopy(const CopyStmt *stmt, const char *queryString);

 extern DestReceiver *CreateCopyDestReceiver(void);

 extern List *CopyGetAttnums(TupleDesc tupDesc, Relation rel, List *attnamelist);
 extern bool CopyReadLineText(CopyState cstate, size_t bytesread);
 extern bool CopyReadLineCSV(CopyState cstate, size_t bytesread);
 extern void CopyReadAttributesText(CopyState cstate, char * __restrict nulls,
 			 int * __restrict attr_offsets, int num_phys_attrs, Form_pg_attribute * __restrict attr);
 extern void CopyReadAttributesCSV(CopyState cstate, char *nulls, int *attr_offsets,
 					  int num_phys_attrs, Form_pg_attribute *attr);
 extern void CopyOneRowTo(CopyState cstate, Oid tupleOid,
 						 Datum *values, bool *nulls);
 extern void CopyOneCustomRowTo(CopyState cstate, bytea *value);
 extern void CopySendEndOfRow(CopyState cstate);
 extern void limit_printout_length(StringInfo buf);
 extern void truncateEol(StringInfo buf, EolType	eol_type);
 extern void setEncodingConversionProc(CopyState cstate, int client_encoding, bool iswritable);
 extern void CopyEolStrToType(CopyState cstate);

 #endif   /* COPY_H */
	/*-------------------------------------------------------------------------
	*
	* copy.h
	* Definitions for using the POSTGRES copy command.
	*
	*
	* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	* $PostgreSQL: pgsql/src/include/commands/copy.h,v 1.28 2006/08/30 23:34:22 tgl Exp $
	*
	*-------------------------------------------------------------------------
	*/
	#ifndef COPY_H
	#define COPY_H

	#include "c.h"
	#include "nodes/parsenodes.h"
	#include "tcop/dest.h"
	#include "executor/executor.h"


	/*
	* Represents the different source/dest cases we need to worry about at
	* the bottom level
	*/
	typedef enum CopyDest
	{
	COPY_FILE, /* to/from file */
	COPY_OLD_FE, /* to/from frontend (2.0 protocol) */
	COPY_NEW_FE, /* to/from frontend (3.0 protocol) */
	COPY_EXTERNAL_SOURCE /* to/from external source (RET/WET) */
	} CopyDest;

	/*
	* Represents the end-of-line terminator type of the input
	*/
	typedef enum EolType
	{
	EOL_UNKNOWN,
	EOL_LF,
	EOL_CR,
	EOL_CRLF
	} EolType;

	/*
	* There are several ways to know the input row number where an error happened,
	* in order to report it reliably.
	*
	* ROWNUM_ORIGINAL - Used when the error happens in the same place that the
	* whole file or data is read, and input row numbers can be tracked reliably.
	* So, if an error happens in the n'th data row read, it's actually the n'th row
	* of the file (or data stream).
	* Using this type are - COPY in dispatch mode, COPY in utility mode, External
	* tables with 'file', and external web tables.
	*
	* ROWNUM_EMBEDDED - Used when the error happens in a place where a random
	* part of the file is read (given to us) and therefore the row numbers of the
	* original data are unknown.
	* So, If an error happens in the n'th row read it can actually be the (n+m)'th
	* row of the original file.
	* Using this type are - COPY in execute mode, and External tables getting data
	* from a gpfdist process in a CSV Format (gpfdist only parses CSV row by row,
	* while text format is parsed in chucks).
	* In this case we do some extra work to retrieve the original row number - the
	* distributor (COPY dispatcher, or gpfdist) embeds the original row number in
	* the beginning of each data row, and this number is extracted later on.
	*
	* BYTENUM_EMBEDDED - Original row isn't even known to the distributor, only
	* the byte offset of each chunk it sends. We report errors in byte offset
	* number, not row number. We keep track of byte counts. This is currently
	* only used by external tables with gpfdist in 'text' format.
	*
	*/
	typedef enum ErrLocType
	{
	ROWNUM_ORIGINAL,
	ROWNUM_EMBEDDED,
	BYTENUM_EMBEDDED
	} ErrLocType;


	/*
	* The error handling mode for this data load.
	*/
	typedef enum CopyErrMode
	{
	ALL_OR_NOTHING, /* Either all rows or no rows get loaded (the default) */
	SREH_IGNORE, /* Sreh - ignore errors (REJECT but no error table) */
	SREH_LOG /* Sreh - log errors in an error table */
	} CopyErrMode;


	/*
	* This struct contains all the state variables used throughout a COPY
	* operation. For simplicity, we use the same struct for all variants of COPY,
	* even though some fields are used in only some cases.
	*
	* Multi-byte encodings: all supported client-side encodings encode multi-byte
	* characters by having the first byte's high bit set. Subsequent bytes of the
	* character can have the high bit not set. When scanning data in such an
	* encoding to look for a match to a single-byte (ie ASCII) character, we must
	* use the full pg_encoding_mblen() machinery to skip over multibyte
	* characters, else we might find a false match to a trailing byte. In
	* supported server encodings, there is no possibility of a false match, and
	* it's faster to make useless comparisons to trailing bytes than it is to
	* invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is TRUE
	* when we have to do it the hard way.
	*/
	typedef struct CopyStateData
	{
	/* low-level state data */
	CopyDest copy_dest; /* type of copy source/destination */
	FILE copy_file; / used if copy_dest == COPY_FILE */
	StringInfo fe_msgbuf; /* used for all dests during COPY TO, only for
	* dest == COPY_NEW_FE in COPY FROM */
	bool fe_copy; /* true for all FE copy dests */
	bool fe_eof; /* true if detected end of copy data */
	EolType eol_type; /* EOL type of input */
	char eol_str; / optional NEWLINE from command. before eol_type is defined */
	int client_encoding; /* remote side's character encoding */
	bool need_transcoding; /* client encoding diff from server? */
	bool encoding_embeds_ascii; /* ASCII can be non-first byte? */
	FmgrInfo enc_conversion_proc; / conv proc from exttbl encoding to
	server or the other way around */
	FmgrInfo custom_formatter_func; / function to convert to custom format */
	char custom_formatter_name; / name of function to convert to custom format */
	List custom_formatter_params; / list of defelems that hold user's format parameters */
	uint64 processed; /* # of tuples processed */
	size_t bytesread;

	/* parameters from the COPY command */
	Relation rel; /* relation to copy to or from */
	QueryDesc queryDesc; / executable query to copy from */
	List attnumlist; / integer list of attnums to copy */
	List attnamelist; / list of attributes by name */
	List force_quote; / the raw fc column name list */
	List force_notnull; / the raw fnn column name list */
	char filename; / filename, or NULL for STDIN/STDOUT */
	bool custom; /* custom format? */
	bool oids; /* include OIDs? */
	bool csv_mode; /* Comma Separated Value format? */
	bool header_line; /* CSV header line? */
	char null_print; / NULL marker string (server encoding!) */
	int null_print_len; /* length of same */
	char null_print_client; / same converted to client encoding */
	char delim; / column delimiter (1 byte or multibytes) */
	char quote; / CSV quote char (must be 1 byte) */
	char escape; / CSV escape char (must be 1 byte) */
	bool force_quote_flags; / per-column CSV FQ flags */
	bool force_notnull_flags; / per-column CSV FNN flags */
	bool fill_missing; /* missing attrs at end of line are NULL */

	/* these are just for error messages, see copy_in_error_callback */
	const char cur_relname; / table name for error messages */
	int64 cur_lineno; /* line number for error messages. Negative means it isn't available. */
	int64 cur_byteno; /* number of bytes processed from input */
	const char cur_attname; / current att for error messages */
	//const char cur_attval; / current att value for error messages */

	/*
	* Working state for COPY TO
	*/
	FmgrInfo out_functions; / lookup info for output functions */
	MemoryContext rowcontext; /* per-row evaluation context */

	/*
	* These variables are used to reduce overhead in textual COPY FROM.
	*
	* attribute_buf holds the separated, de-escaped text for each field of
	* the current line. The CopyReadAttributes functions return arrays of
	* pointers into this buffer. We avoid palloc/pfree overhead by re-using
	* the buffer on each cycle.
	*/
	StringInfoData attribute_buf;

	/*
	* Similarly, line_buf holds the whole input line being processed. The
	* input cycle is first to read the whole line into line_buf, convert it
	* to server encoding there, and then extract the individual attribute
	* fields into attribute_buf. line_buf is preserved unmodified so that we
	* can display it in error messages if appropriate.
	*/
	StringInfoData line_buf;

	int *attr_offsets;

	bool line_buf_converted; /* converted to server encoding? */

	/*
	* Finally, raw_buf holds raw data read from the data source (file or
	* client connection). CopyReadLine parses this data sufficiently to
	* locate line boundaries, then transfers the data to line_buf and
	* converts it. Note: we guarantee that there is a \0 at
	* raw_buf[raw_buf_len].
	*/

	#define RAW_BUF_SIZE 65536

	/* NOTE: raw_buf in Greenplum Database is used with different logic than postgres COPY */
	char raw_buf[RAW_BUF_SIZE + 1]; /* extra byte for '\0' */
	int raw_buf_index; /* next byte to process */
	bool raw_buf_done; /* finished processing the current buffer */
	int missing_bytes; /* see scanTextLine() for explanation */
	/* Greenplum Database specific variables */
	bool is_copy_in; /* copy in or out? */
	char eol_ch[2]; /* The byte values of the 1 or 2 eol bytes */
	bool escape_off; /* treat backslashes as non-special? */
	bool delimiter_off; /* no delimiter. 1-column external tabs only */
	bool delimiter_multibytes;
	int last_hash_field;
	bool line_done; /* finished processing the whole line or
	* stopped in the middle */
	bool end_marker;
	char *begloc;
	char *endloc;
	bool error_on_executor; /* true if errors arrived from the
	* executors COPY (QEs) */
	StringInfoData executor_err_context; /* error context text from QE */


	/* for CSV format */
	bool in_quote;
	bool last_was_esc;

	/* for TEXT format */
	bool esc_in_prevbuf; /* escape was last character of the data input
	* buffer */
	bool cr_in_prevbuf; /* CR was last character of the data input
	* buffer */

	/* Original row number tracking variables */
	#define COPY_METADATA_DELIM '^'
	ErrLocType err_loc_type; /* see enum def for description */
	bool md_error;

	/* Error handling options */
	CopyErrMode errMode;
	struct CdbSreh cdbsreh; / single row error handler */
	int num_consec_csv_err; /* # of consecutive csv invalid format errs */

	PartitionNode partitions; / partitioning meta data from dispatcher */
	List ao_segnos; / AO table meta data from dispatcher */
	List ao_segfileinfos; / AO segment file information from dispatcher */
	List splits; / table scan splits for this segment */
	/* end Greenplum Database specific variables */
	struct QueryResource *resource;
	PlannedStmt planstmt; / ext table copy to */
	} CopyStateData;

	typedef CopyStateData *CopyState;


	#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
	#define OCTVALUE(c) ((c) - '0')

	extern void ValidateControlChars(bool copy, bool load, bool csv_mode, char *delim,
	char null_print, char quote, char *escape,
	List force_quote, List force_notnull,
	bool header_line, bool fill_missing, char *newline,
	int numcols);
	extern uint64 DoCopy(const CopyStmt stmt, const char queryString);

	extern DestReceiver *CreateCopyDestReceiver(void);

	extern List CopyGetAttnums(TupleDesc tupDesc, Relation rel, List attnamelist);
	extern bool CopyReadLineText(CopyState cstate, size_t bytesread);
	extern bool CopyReadLineCSV(CopyState cstate, size_t bytesread);
	extern void CopyReadAttributesText(CopyState cstate, char * __restrict nulls,
	int * __restrict attr_offsets, int num_phys_attrs, Form_pg_attribute * __restrict attr);
	extern void CopyReadAttributesCSV(CopyState cstate, char nulls, int attr_offsets,
	int num_phys_attrs, Form_pg_attribute *attr);
	extern void CopyOneRowTo(CopyState cstate, Oid tupleOid,
	Datum values, bool nulls);
	extern void CopyOneCustomRowTo(CopyState cstate, bytea *value);
	extern void CopySendEndOfRow(CopyState cstate);
	extern void limit_printout_length(StringInfo buf);
	extern void truncateEol(StringInfo buf, EolType eol_type);
	extern void setEncodingConversionProc(CopyState cstate, int client_encoding, bool iswritable);
	extern void CopyEolStrToType(CopyState cstate);

	#endif /* COPY_H */