tools/tpch/text.c - hawq - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 /*
  * $Id$
  *
  * Revision History
  * ===================
  * $Log: text.c,v $
  * Revision 1.3  2007/10/25 09:27:37  cktan
  * fixed.
  *
  * Revision 1.2  2007/10/25 05:44:25  cktan
  * more cleanup
  *
  * Revision 1.1  2007/10/24 20:25:23  cktan
  * new
  *
  * Revision 1.2  2007/04/07 08:10:40  cmcdevitt
  * Fixes for dbgen with large scale factors
  *
  * Revision 1.6  2006/07/31 17:23:09  jms
  * fix to parallelism problem
  *
  * Revision 1.5  2006/05/18 23:50:00  jms
  * commit text generation change with larger buffer
  *
  * Revision 1.4  2006/05/16 16:26:51  jms
  * remove calls to FAKE_V_STR
  *
  * Revision 1.3  2006/05/16 15:55:58  jms
  * first cut to Meikel
  *
  * Revision 1.2  2005/01/03 20:08:59  jms
  * change line terminations
  *
  * Revision 1.1.1.1  2004/11/24 23:31:47  jms
  * re-establish external server
  *
  * Revision 1.1.1.1  2003/08/07 17:58:34  jms
  * recreation after CVS crash
  *
  * Revision 1.2  2003/08/07 17:58:34  jms
  * Convery RNG to 64bit space as preparation for new large scale RNG
  *
  * Revision 1.1.1.1  2003/04/03 18:54:21  jms
  * initial checkin
  *
  *
  */
 /*
  * text.c --- pseudo text generator for use in DBGEN 2.0
  *
  * Defined Routines:
  *		dbg_text() -- select and translate a sentence form
  */

 #ifdef TEXT_TEST
 #define DECLARER
 #endif /* TEST */

 #include "config.h"
 #include <stdlib.h>
 #if (defined(_POSIX_)||!defined(WIN32))		/* Change for Windows NT */
 #include <unistd.h>
 #include <sys/wait.h>
 #endif /* WIN32 */
 #include <stdio.h>				/* */
 #include <limits.h>
 #include <math.h>
 #include <ctype.h>
 #include <signal.h>
 #include <string.h>
 #include <errno.h>
 #ifdef HP
 #include <strings.h>
 #endif
 #if (defined(WIN32)&&!defined(_POSIX_))
 #include <process.h>
 #pragma warning(disable:4201)
 #pragma warning(disable:4214)
 #pragma warning(disable:4514)
 #define WIN32_LEAN_AND_MEAN
 #define NOATOM
 #define NOGDICAPMASKS
 #define NOMETAFILE
 #define NOMINMAX
 #define NOMSG
 #define NOOPENFILE
 #define NORASTEROPS
 #define NOSCROLL
 #define NOSOUND
 #define NOSYSMETRICS
 #define NOTEXTMETRIC
 #define NOWH
 #define NOCOMM
 #define NOKANJI
 #define NOMCX
 #include <windows.h>
 #pragma warning(default:4201)
 #pragma warning(default:4214)
 #endif

 #define TEXT_POOL_SIZE (300 * 1024 * 1024)  /* 300MiB */

 #include "dss.h"
 #include "dsstypes.h"

 /*
  * txt_vp() --
  *     generate a verb phrase by
  *     1) selecting a verb phrase form from the vp distribution
  *     2) splitting the form into space separated tokens
  *     3) choosing a word for each token from the distribution named by
  *        its first character (D = adverbs, V = verbs, X = auxillaries)
  *     4) appending the token's second character, if present, as
  *        punctuation directly after the word
  *     5) appending one space after every word
  *
  *   dest: buffer that receives the phrase; the last byte stored by this
  *         routine is the trailing space, no terminator is added after it
  *   sd:   passed unchanged to every pick_str() call (presumably the
  *         random stream selector -- confirm against pick_str())
  *
  *   Returns: number of bytes added to dest (words, punctuation, spaces)
  *   Called By: txt_sentence()
  *   Calls: pick_str()
  *
  *   Tokenizing uses strtok(), which keeps hidden state between calls, so
  *   a caller must not be part-way through a strtok() scan of its own.
  */
 static int
     txt_vp(char *dest, int sd)
 {
     char syntax[MAX_GRAMMAR_LEN + 1],
         *cptr;
     distribution *src;
     int i,
         res = 0;

     pick_str(&vp, sd, &syntax[0]);
     for (cptr = strtok(syntax, " "); cptr != NULL; cptr = strtok(NULL, " ")) {
         switch(*cptr) {
         case 'D':
             src = &adverbs;
             break;
         case 'V':
             src = &verbs;
             break;
         case 'X':
             src = &auxillaries;
             break;
         default:
             src = NULL;
             break;
         }   /* end of POS switch statement */

         /* unknown part of speech: skip it rather than hand a null
          * distribution to pick_str() and DIST_MEMBER() */
         if (src == NULL)
             continue;

         /* pick_str() is relied on to place the chosen word at dest; the
          * index it returns is used only to learn the word's length */
         i = pick_str(src, sd, dest);
         i = (int)strlen(DIST_MEMBER(src, i));
         dest += i;
         res += i;
         if (*(++cptr)) {
             /* miscellaneous filigree, like punctuation: store it right
              * after the word, THEN step past it, so the separator below
              * does not overwrite it (same order as txt_np()) */
             *dest = *cptr;
             dest += 1;
             res += 1;
         }
         *dest = ' ';
         dest++;
         res++;
     }   /* end of token loop */

     return(res);
 }

 /*
  * txt_np() --
  *     generate a noun phrase by
  *     1) selecting a noun phrase form from the np distribution
  *     2) splitting the form into space separated tokens
  *     3) choosing a word for each token from the distribution named by
  *        its first character (A = articles, J = adjectives,
  *        D = adverbs, N = nouns)
  *     4) appending the token's second character, if present, as
  *        punctuation directly after the word
  *     5) appending one space after every word
  *
  *   dest: buffer that receives the phrase; the last byte stored by this
  *         routine is the trailing space, no terminator is added after it
  *   sd:   passed unchanged to every pick_str() call (presumably the
  *         random stream selector -- confirm against pick_str())
  *
  *   Returns: number of bytes added to dest (words, punctuation, spaces)
  *   Called By: txt_sentence()
  *   Calls: pick_str()
  *
  *   Tokenizing uses strtok(), which keeps hidden state between calls, so
  *   a caller must not be part-way through a strtok() scan of its own.
  */
 static int
     txt_np(char *dest, int sd)
 {
     char syntax[MAX_GRAMMAR_LEN + 1],
         *cptr;
     distribution *src;
     int i,
         res = 0;

     pick_str(&np, sd, &syntax[0]);
     for (cptr = strtok(syntax, " "); cptr != NULL; cptr = strtok(NULL, " ")) {
         switch(*cptr) {
         case 'A':
             src = &articles;
             break;
         case 'J':
             src = &adjectives;
             break;
         case 'D':
             src = &adverbs;
             break;
         case 'N':
             src = &nouns;
             break;
         default:
             src = NULL;
             break;
         }   /* end of POS switch statement */

         /* unknown part of speech: skip it rather than hand a null
          * distribution to pick_str() and DIST_MEMBER() */
         if (src == NULL)
             continue;

         /* pick_str() is relied on to place the chosen word at dest; the
          * index it returns is used only to learn the word's length */
         i = pick_str(src, sd, dest);
         i = (int)strlen(DIST_MEMBER(src, i));
         dest += i;
         res += i;
         if (*(++cptr)) {
             /* miscellaneous filigree, like punctuation */
             *dest = *cptr;
             dest += 1;
             res += 1;
         }
         *dest = ' ';
         dest++;
         res++;
     }   /* end of token loop */

     return(res);
 }

 /*
  * txt_sentence() --
  *     generate a sentence by
  *     1) selecting a sentence form from the grammar distribution
  *     2) walking the form one character token at a time
  *        (V = verb phrase, N = noun phrase, P = preposition + " the " +
  *        noun phrase, T = terminator)
  *     3) appending the generated text for each token to dest
  *     4) appending a character that directly follows a token as
  *        punctuation
  *
  *   dest: buffer that receives the sentence; terminated with '\0' here
  *   sd:   passed unchanged to pick_str(), txt_np() and txt_vp()
  *
  *   Returns: the running byte count minus one.  The one-byte adjustment
  *            matches the byte reclaimed when a terminator overwrites the
  *            trailing space of the preceding phrase; an empty form
  *            therefore yields -1.
  *   Called By: dbg_text()
  *   Calls: pick_str(), txt_np(), txt_vp()
  *
  *   The form is scanned by hand instead of with strtok() because
  *   txt_np() and txt_vp() run their own strtok() scans.
  *   For forms made only of space separated V/N/P/T tokens the output is
  *   the same as before; the changes affect unknown tokens, attached
  *   punctuation and a form that begins with T.
  */
 static int
     txt_sentence(char *dest, int sd)
 {
     char syntax[MAX_GRAMMAR_LEN + 1],
         *cptr,
         *start = dest;
     int i,
         res = 0,
         len;

     pick_str(&grammar, sd, syntax);
     cptr = syntax;

     for (;;) {
         while (*cptr == ' ')
             cptr++;
         if (*cptr == '\0')
             break;

         /* reset per token so an unrecognised character adds nothing
          * instead of re-applying the previous token's length */
         len = 0;
         switch(*cptr) {
         case 'V':
             len = txt_vp(dest, sd);
             break;
         case 'N':
             len = txt_np(dest, sd);
             break;
         case 'P':
             i = pick_str(&prepositions, sd, dest);
             len = (int)strlen(DIST_MEMBER(&prepositions, i));
             strcpy((dest + len), " the ");
             len += 5;
             len += txt_np(dest + len, sd);
             break;
         case 'T':
             /* terminators should abut the previous word, so back up
              * over its trailing space; with nothing before it there is
              * no space to reclaim and backing up would leave the buffer */
             if (dest == start)
                 break;
             i = pick_str(&terminators, sd, --dest);
             len = (int)strlen(DIST_MEMBER(&terminators, i));
             break;
         default:
             break;
         }   /* end of POS switch statement */
         dest += len;
         res += len;
         cptr++;
         if (*cptr && *cptr != ' ') {
             /* miscellaneous filigree, like punctuation: store it, step
              * past it in the output, and consume it from the form so it
              * is not parsed as a token on the next pass */
             *dest = *cptr;
             dest += 1;
             res += 1;
             cptr++;
         }
     }

     *dest = '\0';
     return(res - 1);
 }

 /*
  * dbg_text() --
  *     produce ELIZA-like text of random, bounded length
  *
  *   The first call fills a function-static pool of TEXT_POOL_SIZE bytes
  *   with sentences from txt_sentence() (always called with sd = 5), each
  *   followed by one space; the sentence that would overrun the pool is
  *   cut short so the pool is filled exactly.  Every call then draws an
  *   offset and a length with RANDOM() and copies that slice of the pool
  *   into tgt, followed by a terminator.
  *
  *   tgt:      output buffer; must hold the drawn length plus one byte
  *             (presumably at most max + 1 -- confirm RANDOM()'s bounds)
  *   min, max: bounds given to RANDOM() for the length; max also limits
  *             the highest offset to TEXT_POOL_SIZE - max
  *   sd:       last argument of both RANDOM() draws
  *
  *   The pool and its "filled" flag are static and are not guarded by any
  *   lock in this routine.
  */
 void
     dbg_text(char *tgt, int min, int max, int sd)
 {
     static char pool[TEXT_POOL_SIZE + 1];
     static int pool_filled = 0;
     DSS_HUGE slice_len = 0;
     DSS_HUGE slice_off;
     DSS_HUGE used = 0;
     DSS_HUGE sent_len;
     DSS_HUGE room;
     char sent[MAX_SENT_LEN + 1];
     char *out;
     int next_report = 0;

     if (pool_filled == 0) {
         out = pool;
         if (o_verbose)
             fprintf(stderr, "\nPreloading text ... ");

         while (used < TEXT_POOL_SIZE) {
             /* progress report roughly every 200000 bytes */
             if (o_verbose && (used > next_report)) {
                 next_report += 200000;
                 fprintf(stderr, "%3.0f%%\b\b\b\b", (100.0 * used)/TEXT_POOL_SIZE);
             }

             sent_len = txt_sentence(sent, 5);
             if (sent_len < 0)
                 INTERNAL_ERROR("Bad sentence formation");

             room = TEXT_POOL_SIZE - used;
             if (room <= sent_len) {
                 /* not enough room for sentence plus separator:
                  * chop the sentence so it ends exactly at the pool end */
                 sent[room] = '\0';
                 strcpy(out, sent);
                 used += room;
                 out += room;
             }
             else {
                 /* whole sentence fits; follow it with one space */
                 strcpy(out, sent);
                 out += sent_len;
                 *out = ' ';
                 out++;
                 used += sent_len + 1;
             }
         }
         *out = '\0';
         pool_filled = 1;
         if (o_verbose)
             fprintf(stderr, "\n");
     }

     /* offset first, then length: the draw order is kept as is */
     RANDOM(slice_off, 0, TEXT_POOL_SIZE - max, sd);
     RANDOM(slice_len, min, max, sd);
     strncpy(tgt, pool + slice_off, (int)slice_len);
     tgt[slice_len] = '\0';
 }

 #ifdef TEXT_TEST
 /*
  * Definitions compiled only for the standalone TEXT_TEST build.
  * tdefs is a one-entry placeholder; presumably it exists only so code
  * shared with the full generator still links -- TODO confirm.
  */
 tdef tdefs[1] = { NULL };
 /*
  * Word and grammar distributions used by the text generator.  In this
  * test build main() fills each of them with read_dist() before any text
  * is generated.
  */
 distribution nouns,
     verbs,
     adjectives,
     adverbs,
     auxillaries,
     terminators,
     articles,
     prepositions,
     grammar,
     np,
     vp;

 main()
 {
     char prattle[401];

     verbose = 1;

     read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns", &nouns);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs", &verbs);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives", &adjectives);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs", &adverbs);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators", &terminators);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "articles", &articles);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions", &prepositions);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar", &grammar);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "np", &np);
     read_dist (env_config (DIST_TAG, DIST_DFLT), "vp", &vp);

     while (1) {
 	dbg_text(&prattle[0], 300, 400, 0);
 	printf("<%s>\n", prattle);
     }

     return(0);
 }
 #endif /* TEST */
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	/*
	* $Id$
	*
	* Revision History
	* ===================
	* $Log: text.c,v $
	* Revision 1.3 2007/10/25 09:27:37 cktan
	* fixed.
	*
	* Revision 1.2 2007/10/25 05:44:25 cktan
	* more cleanup
	*
	* Revision 1.1 2007/10/24 20:25:23 cktan
	* new
	*
	* Revision 1.2 2007/04/07 08:10:40 cmcdevitt
	* Fixes for dbgen with large scale factors
	*
	* Revision 1.6 2006/07/31 17:23:09 jms
	* fix to parallelism problem
	*
	* Revision 1.5 2006/05/18 23:50:00 jms
	* commit text generation change with larger buffer
	*
	* Revision 1.4 2006/05/16 16:26:51 jms
	* remove calls to FAKE_V_STR
	*
	* Revision 1.3 2006/05/16 15:55:58 jms
	* first cut to Meikel
	*
	* Revision 1.2 2005/01/03 20:08:59 jms
	* change line terminations
	*
	* Revision 1.1.1.1 2004/11/24 23:31:47 jms
	* re-establish external server
	*
	* Revision 1.1.1.1 2003/08/07 17:58:34 jms
	* recreation after CVS crash
	*
	* Revision 1.2 2003/08/07 17:58:34 jms
	* Convery RNG to 64bit space as preparation for new large scale RNG
	*
	* Revision 1.1.1.1 2003/04/03 18:54:21 jms
	* initial checkin
	*
	*
	*/
	/*
	* text.c --- pseudo text generator for use in DBGEN 2.0
	*
	* Defined Routines:
	* dbg_text() -- select and translate a sentence form
	*/

	#ifdef TEXT_TEST
	#define DECLARER
	#endif /* TEST */

	#include "config.h"
	#include <stdlib.h>
	#if (defined(_POSIX_)\|\|!defined(WIN32)) /* Change for Windows NT */
	#include <unistd.h>
	#include <sys/wait.h>
	#endif /* WIN32 */
	#include <stdio.h> /* */
	#include <limits.h>
	#include <math.h>
	#include <ctype.h>
	#include <signal.h>
	#include <string.h>
	#include <errno.h>
	#ifdef HP
	#include <strings.h>
	#endif
	#if (defined(WIN32)&&!defined(_POSIX_))
	#include <process.h>
	#pragma warning(disable:4201)
	#pragma warning(disable:4214)
	#pragma warning(disable:4514)
	#define WIN32_LEAN_AND_MEAN
	#define NOATOM
	#define NOGDICAPMASKS
	#define NOMETAFILE
	#define NOMINMAX
	#define NOMSG
	#define NOOPENFILE
	#define NORASTEROPS
	#define NOSCROLL
	#define NOSOUND
	#define NOSYSMETRICS
	#define NOTEXTMETRIC
	#define NOWH
	#define NOCOMM
	#define NOKANJI
	#define NOMCX
	#include <windows.h>
	#pragma warning(default:4201)
	#pragma warning(default:4214)
	#endif

	#define TEXT_POOL_SIZE (300 * 1024 * 1024) /* 300MiB */

	#include "dss.h"
	#include "dsstypes.h"

	/*
	 * txt_vp() --
	 *     generate a verb phrase by
	 *     1) selecting a verb phrase form from the vp distribution
	 *     2) splitting the form into space separated tokens
	 *     3) choosing a word for each token from the distribution named by
	 *        its first character (D = adverbs, V = verbs, X = auxillaries)
	 *     4) appending the token's second character, if present, as
	 *        punctuation directly after the word
	 *     5) appending one space after every word
	 *
	 *   dest: buffer that receives the phrase; the last byte stored by this
	 *         routine is the trailing space, no terminator is added after it
	 *   sd:   passed unchanged to every pick_str() call (presumably the
	 *         random stream selector -- confirm against pick_str())
	 *
	 *   Returns: number of bytes added to dest (words, punctuation, spaces)
	 *   Called By: txt_sentence()
	 *   Calls: pick_str()
	 *
	 *   Tokenizing uses strtok(), which keeps hidden state between calls, so
	 *   a caller must not be part-way through a strtok() scan of its own.
	 */
	static int
	txt_vp(char *dest, int sd)
	{
	    char syntax[MAX_GRAMMAR_LEN + 1],
	        *cptr;
	    distribution *src;
	    int i,
	        res = 0;

	    pick_str(&vp, sd, &syntax[0]);
	    for (cptr = strtok(syntax, " "); cptr != NULL; cptr = strtok(NULL, " ")) {
	        switch(*cptr) {
	        case 'D':
	            src = &adverbs;
	            break;
	        case 'V':
	            src = &verbs;
	            break;
	        case 'X':
	            src = &auxillaries;
	            break;
	        default:
	            src = NULL;
	            break;
	        }   /* end of POS switch statement */

	        /* unknown part of speech: skip it rather than hand a null
	         * distribution to pick_str() and DIST_MEMBER() */
	        if (src == NULL)
	            continue;

	        /* pick_str() is relied on to place the chosen word at dest; the
	         * index it returns is used only to learn the word's length */
	        i = pick_str(src, sd, dest);
	        i = (int)strlen(DIST_MEMBER(src, i));
	        dest += i;
	        res += i;
	        if (*(++cptr)) {
	            /* miscellaneous filigree, like punctuation: copy the
	             * character into the output right after the word, then
	             * step past it so the separator below does not overwrite it */
	            *dest = *cptr;
	            dest += 1;
	            res += 1;
	        }
	        *dest = ' ';
	        dest++;
	        res++;
	    }   /* end of token loop */

	    return(res);
	}

	/*
	 * txt_np() --
	 *     generate a noun phrase by
	 *     1) selecting a noun phrase form from the np distribution
	 *     2) splitting the form into space separated tokens
	 *     3) choosing a word for each token from the distribution named by
	 *        its first character (A = articles, J = adjectives,
	 *        D = adverbs, N = nouns)
	 *     4) appending the token's second character, if present, as
	 *        punctuation directly after the word
	 *     5) appending one space after every word
	 *
	 *   dest: buffer that receives the phrase; the last byte stored by this
	 *         routine is the trailing space, no terminator is added after it
	 *   sd:   passed unchanged to every pick_str() call (presumably the
	 *         random stream selector -- confirm against pick_str())
	 *
	 *   Returns: number of bytes added to dest (words, punctuation, spaces)
	 *   Called By: txt_sentence()
	 *   Calls: pick_str()
	 *
	 *   Tokenizing uses strtok(), which keeps hidden state between calls, so
	 *   a caller must not be part-way through a strtok() scan of its own.
	 */
	static int
	txt_np(char *dest, int sd)
	{
	    char syntax[MAX_GRAMMAR_LEN + 1],
	        *cptr;
	    distribution *src;
	    int i,
	        res = 0;

	    pick_str(&np, sd, &syntax[0]);
	    for (cptr = strtok(syntax, " "); cptr != NULL; cptr = strtok(NULL, " ")) {
	        switch(*cptr) {
	        case 'A':
	            src = &articles;
	            break;
	        case 'J':
	            src = &adjectives;
	            break;
	        case 'D':
	            src = &adverbs;
	            break;
	        case 'N':
	            src = &nouns;
	            break;
	        default:
	            src = NULL;
	            break;
	        }   /* end of POS switch statement */

	        /* unknown part of speech: skip it rather than hand a null
	         * distribution to pick_str() and DIST_MEMBER() */
	        if (src == NULL)
	            continue;

	        /* pick_str() is relied on to place the chosen word at dest; the
	         * index it returns is used only to learn the word's length */
	        i = pick_str(src, sd, dest);
	        i = (int)strlen(DIST_MEMBER(src, i));
	        dest += i;
	        res += i;
	        if (*(++cptr)) {
	            /* miscellaneous filigree, like punctuation: copy the
	             * character into the output, then step past it */
	            *dest = *cptr;
	            dest += 1;
	            res += 1;
	        }
	        *dest = ' ';
	        dest++;
	        res++;
	    }   /* end of token loop */

	    return(res);
	}

	/*
	 * txt_sentence() --
	 *     generate a sentence by
	 *     1) selecting a sentence form from the grammar distribution
	 *     2) walking the form one character token at a time
	 *        (V = verb phrase, N = noun phrase, P = preposition + " the " +
	 *        noun phrase, T = terminator)
	 *     3) appending the generated text for each token to dest
	 *     4) appending a character that directly follows a token as
	 *        punctuation
	 *
	 *   dest: buffer that receives the sentence; terminated with '\0' here
	 *   sd:   passed unchanged to pick_str(), txt_np() and txt_vp()
	 *
	 *   Returns: the running byte count minus one.  The one-byte adjustment
	 *            matches the byte reclaimed when a terminator overwrites the
	 *            trailing space of the preceding phrase; an empty form
	 *            therefore yields -1.
	 *   Called By: dbg_text()
	 *   Calls: pick_str(), txt_np(), txt_vp()
	 *
	 *   The form is scanned by hand instead of with strtok() because
	 *   txt_np() and txt_vp() run their own strtok() scans.
	 */
	static int
	txt_sentence(char *dest, int sd)
	{
	    char syntax[MAX_GRAMMAR_LEN + 1],
	        *cptr,
	        *start = dest;
	    int i,
	        res = 0,
	        len;

	    pick_str(&grammar, sd, syntax);
	    cptr = syntax;

	    for (;;) {
	        /* compare the character under the cursor, not the pointer */
	        while (*cptr == ' ')
	            cptr++;
	        if (*cptr == '\0')
	            break;

	        /* reset per token so an unrecognised character adds nothing
	         * instead of re-applying the previous token's length */
	        len = 0;
	        switch(*cptr) {
	        case 'V':
	            len = txt_vp(dest, sd);
	            break;
	        case 'N':
	            len = txt_np(dest, sd);
	            break;
	        case 'P':
	            i = pick_str(&prepositions, sd, dest);
	            len = (int)strlen(DIST_MEMBER(&prepositions, i));
	            strcpy((dest + len), " the ");
	            len += 5;
	            len += txt_np(dest + len, sd);
	            break;
	        case 'T':
	            /* terminators should abut the previous word, so back up
	             * over its trailing space; with nothing before it there is
	             * no space to reclaim and backing up would leave the buffer */
	            if (dest == start)
	                break;
	            i = pick_str(&terminators, sd, --dest);
	            len = (int)strlen(DIST_MEMBER(&terminators, i));
	            break;
	        default:
	            break;
	        }   /* end of POS switch statement */
	        dest += len;
	        res += len;
	        cptr++;
	        if (*cptr && *cptr != ' ') {
	            /* miscellaneous filigree, like punctuation: copy the
	             * character into the output, step past it, and consume it
	             * from the form so it is not parsed as a token next pass */
	            *dest = *cptr;
	            dest += 1;
	            res += 1;
	            cptr++;
	        }
	    }

	    *dest = '\0';
	    return(res - 1);
	}

	/*
	 * dbg_text() --
	 *     produce ELIZA-like text of random, bounded length
	 *
	 *   The first call fills a function-static pool of TEXT_POOL_SIZE bytes
	 *   with sentences from txt_sentence() (always called with sd = 5), each
	 *   followed by one space; the sentence that would overrun the pool is
	 *   cut short so the pool is filled exactly.  Every call then draws an
	 *   offset and a length with RANDOM() and copies that slice of the pool
	 *   into tgt, followed by a terminator.
	 *
	 *   tgt:      output buffer; must hold the drawn length plus one byte
	 *             (presumably at most max + 1 -- confirm RANDOM()'s bounds)
	 *   min, max: bounds given to RANDOM() for the length; max also limits
	 *             the highest offset to TEXT_POOL_SIZE - max
	 *   sd:       last argument of both RANDOM() draws
	 *
	 *   The pool and its "filled" flag are static and are not guarded by any
	 *   lock in this routine.
	 */
	void
	dbg_text(char *tgt, int min, int max, int sd)
	{
	    static char pool[TEXT_POOL_SIZE + 1];
	    static int pool_filled = 0;
	    DSS_HUGE slice_len = 0;
	    DSS_HUGE slice_off;
	    DSS_HUGE used = 0;
	    DSS_HUGE sent_len;
	    DSS_HUGE room;
	    char sent[MAX_SENT_LEN + 1];
	    char *out;
	    int next_report = 0;

	    if (pool_filled == 0) {
	        out = pool;
	        if (o_verbose)
	            fprintf(stderr, "\nPreloading text ... ");

	        while (used < TEXT_POOL_SIZE) {
	            /* progress report roughly every 200000 bytes */
	            if (o_verbose && (used > next_report)) {
	                next_report += 200000;
	                fprintf(stderr, "%3.0f%%\b\b\b\b", (100.0 * used)/TEXT_POOL_SIZE);
	            }

	            sent_len = txt_sentence(sent, 5);
	            if (sent_len < 0)
	                INTERNAL_ERROR("Bad sentence formation");

	            room = TEXT_POOL_SIZE - used;
	            if (room <= sent_len) {
	                /* not enough room for sentence plus separator:
	                 * chop the sentence so it ends exactly at the pool end */
	                sent[room] = '\0';
	                strcpy(out, sent);
	                used += room;
	                out += room;
	            }
	            else {
	                /* whole sentence fits; follow it with one space */
	                strcpy(out, sent);
	                out += sent_len;
	                *out = ' ';
	                out++;
	                used += sent_len + 1;
	            }
	        }
	        *out = '\0';
	        pool_filled = 1;
	        if (o_verbose)
	            fprintf(stderr, "\n");
	    }

	    /* offset first, then length: the draw order is kept as is */
	    RANDOM(slice_off, 0, TEXT_POOL_SIZE - max, sd);
	    RANDOM(slice_len, min, max, sd);
	    strncpy(tgt, pool + slice_off, (int)slice_len);
	    tgt[slice_len] = '\0';
	}

	#ifdef TEXT_TEST
	/*
	 * Definitions compiled only for the standalone TEXT_TEST build.
	 * tdefs is a one-entry placeholder; presumably it exists only so code
	 * shared with the full generator still links -- TODO confirm.
	 */
	tdef tdefs[1] = { NULL };
	/*
	 * Word and grammar distributions used by the text generator.  In this
	 * test build main() fills each of them with read_dist() before any text
	 * is generated.
	 */
	distribution nouns,
	verbs,
	adjectives,
	adverbs,
	auxillaries,
	terminators,
	articles,
	prepositions,
	grammar,
	np,
	vp;

	main()
	{
	char prattle[401];

	verbose = 1;

	read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns", &nouns);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs", &verbs);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives", &adjectives);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs", &adverbs);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators", &terminators);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "articles", &articles);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions", &prepositions);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar", &grammar);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "np", &np);
	read_dist (env_config (DIST_TAG, DIST_DFLT), "vp", &vp);

	while (1) {
	dbg_text(&prattle[0], 300, 400, 0);
	printf("<%s>\n", prattle);
	}

	return(0);
	}
	#endif /* TEST */