| /* gen_diff_test_data.c -- Generate sample data to test svn's diff |
| * |
| * ==================================================================== |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * ==================================================================== |
| * |
| * |
| * This is a program to generate some pathological sample data for |
| * testing and improving diff implementations. |
| * |
| * The output is deterministic but varies based on a seed value, like |
| * that provided to a pseudo random number generator. The output |
| * length is controlled as well. Both parameters are given at the |
| * command line. The output is written to stdout. |
| * |
| * Presumably if two large outputs are generated by two runs with |
| * different seed values, it will take a diff algorithm a long time to |
| * calculate their longest common subsequence. |
| * |
| * |
| * Usage: |
| * |
| * $ gen_diff_test_data <seed> <length> |
| * |
| * |
| * Implementation notes: |
| * |
| * Rather than use the system-provided pseudo random number generator, |
| * this program implements the hailstone sequence (see [1]) to assure |
| * that users on different systems can produce same outputs when using |
| * same seed and length values. That way people don't have to send |
| * each other huge >100M files of useless junk. :-) |
| * |
| * |
| * References: |
| * |
| * [1] Hailstone sequence: See Collatz Conjecture |
| * https://en.wikipedia.org/wiki/Collatz_conjecture |
| */ |
| |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <stdarg.h> |
| #include <string.h> |
| #include <limits.h> |
| |
| |
| #define PROGRAM_VERSION "0.01" |
| |
| |
| /* starting number for a hailstone sequence |
| */ |
| static uint64_t g_seed; |
| |
| |
| /* desired length of the output (approximately; actual may be longer) |
| */ |
| static uint64_t g_length; |
| |
| |
| /* current number in the hailstone sequence |
| */ |
| static uint64_t g_curr; |
| |
| |
| /* current word in words[] array |
| */ |
| static uint64_t g_word_index; |
| |
| |
| /* number of bytes written to stdout |
| */ |
| static uint64_t g_written; |
| |
| |
| /* how much to indent lines |
| */ |
| static int g_indents; |
| |
| |
| /* a bunch of random words to print in the output |
| */ |
| static const char * words[] = { |
| "list", "exe", "MODULE", "EXE", "BIT", "database", "POINT", "link", |
| "node", "parent", "BYTE", "enumerated", "OPTION", "managed", |
| "deprecated", "point", "inheritance", "OUT", "VARIABLE", "PERL", |
| "core", "else", "provider", "IMPLEMENTATION", "ENDIANNESS", |
| "platform", "TYPE", "SCANNER", "libc", "lisp", "PROCESSOR", "path", |
| "optimisation", "NANO", "subversion", "FORTRAN", "support", "EMPTY", |
| "parser", "EXTENSION", "LOOP", "COLUMN", "resource", "end", |
| "SUBCLASS", "optimal", "silicon", "row", "EXTENSIONS", "config", |
| "EXCEPTION", "INHERITANCE", "BEGIN", "emacs", "VALLEY", "PROJECT", |
| "EXTERNAL", "version", "subclass", "array", "ABI", "OPTIMISATION", |
| "CLEAN", "ENVIRONMENT", "COL", "string", "RESOURCE", "VECTOR", |
| "true", "STANDALONE", "VAR", "cobol", "DATA", "main", "TOOL", |
| "ERROR", "IF", "drive", "errno", "artifact", "NO", "no", "DEVICE", |
| "namespace", "name", "while", "dependencies", "IOCTL", "FLOAT", |
| "SUBVERSION", "variable", "fortran", "external", "COBOL", "SILICON", |
| "table", "API", "DATABASE", "ioctl", "BUILTIN", "polymorphism", |
| "empty", "extensions", "OPTIMAL", "target", "optimization", |
| "superclass", "INTERFACE", "interface", "PREFERENCES", "FOR", "asm", |
| "var", "diagnostic", "PARALLELIZATION", "type", "xml", "linker", |
| "PROVIDER", "leaf", "valley", "LINK", "TOOLCHAIN", "false", |
| "DIAGNOSTIC", "RUNTIME", "CONFIGURATION", "CORE", "CONST", |
| "MANAGED", "LEAF", "encoding", "switch", "CASE", "ERRNO", "DEBUG", |
| "LIST", "double", "STATE", "builtin", "TARGET", "PYTHON", "SCRIPT", |
| "definitions", "file", "if", "TABLE", "SETTINGS", "compiler", |
| "ENUMERATED", "FALSE", "EXECUTABLE", "technical", "POLYMORPHISM", |
| "vector", "STUDIO", "NAME", "float", "VERSION", "exception", "TRUE", |
| "bit", "STORAGE", "INCANTATION", "endianness", "NODE", "id", "XML", |
| "DONE", "INVOCATION", "environment", "PARENT", "SUPPORT", "tool", |
| "ARRAY", "state", "project", "configuration", "const", "module", |
| "builder", "BUILDER", "parallelization", "perl", "standalone", |
| "ARTIFACT", "OPTIMIZATION", "COMPILER", "executable", |
| "DEPENDENCIES", "nil", "column", "debug", "FILE", "option", |
| "DEPRECATED", "COMMAND", "abi", "processor", "ENCODING", "command", |
| "WHILE", "LISP", "vim", "DOUBLE", "folder", "script", "EMACS", |
| "col", "DRIVE", "build", "case", "PARSER", "device", "clean", "NIL", |
| "storage", "preferences", "VIM", "END", "NAMESPACE", "data", |
| "toolchain", "STRING", "error", "description", "RELEASE", |
| "incantation", "nano", "do", "TECHNICAL", "ROW", "scanner", |
| "binary", "SUPERCLASS", "DESCRIPTION", "DO", "CONFIG", "invocation", |
| "DIRECTORY", "done", "SWITCH", "NULL", "FOLDER", "LIBC", "BUILD", |
| "ASM", "directory", "LINKER", "MAIN", "ID", "THEN", |
| "implementation", "ELSE", "PLATFORM", "PATH", "then", "connection", |
| "studio", "DEFINITIONS", "out", "null", "CONNECTION", "loop", |
| "python", "runtime", "api", "BINARY" |
| }; |
| |
| |
| /* temporary space for constructing strings |
| */ |
| static char scratchpad1[1024]; |
| static char scratchpad2[1024]; |
| |
| |
| /* something bad happened; print message and terminate execution |
| */ |
| static void |
| die(const char * s) |
| { |
| if (s) |
| { |
| fprintf(stderr, "gen_diff_test_data: %s\n", s); |
| } |
| |
| exit(1); |
| } |
| |
| |
| /* given a value, calculate next value in hailstone sequence |
| * |
| * f(n) = 3n+1 if n odd, n/2 if n even |
| */ |
| static uint64_t |
| hailstone(uint64_t n) |
| { |
| return (n & 1) ? (n * 3) + 1 : n >> 1; |
| } |
| |
| |
| /* advance global variable to next value in hailstone sequence |
| * if reached end of sequence, reseed and restart |
| */ |
| static void |
| advance(void) |
| { |
| if (g_curr == 1) |
| { |
| g_seed++; |
| g_curr = g_seed; |
| } |
| else |
| { |
| g_curr = hailstone(g_curr); |
| } |
| } |
| |
| |
| /* get another "pseudo-random" word from words[] and advance in |
| * hailstone sequence |
| */ |
| static const char * |
| word(void) |
| { |
| const char * ret; |
| |
| g_word_index += g_curr; |
| ret = words[g_word_index % (sizeof(words) / sizeof(words[0]))]; |
| |
| advance(); |
| |
| return ret; |
| } |
| |
| |
| /* get another "pseudo-random" number and advance in hailstone |
| * sequence |
| */ |
| static int |
| number(void) |
| { |
| int ret = (int) g_curr; |
| |
| advance(); |
| |
| return ret; |
| } |
| |
| |
| /* print a hopefully helpful message and then quit |
| */ |
| static void |
| usage(void) |
| { |
| fprintf(stderr, "gen_diff_test_data version %s\n\n", |
| PROGRAM_VERSION); |
| |
| fprintf(stderr, |
| "Usage: gen_diff_test_data <seed> <length>\n" |
| "Where:\n" |
| " seed - controls the content of the output\n" |
| " length - in bytes controls amount written\n" |
| " approximately; actual output could be\n" |
| " longer; can use k, m, or g suffix\n\n"); |
| |
| exit(1); |
| } |
| |
| |
| /* parse command line arguments and validate them successfully or quit |
| */ |
| static void |
| parse_args(int argc, const char * argv[]) |
| { |
| char * endptr; |
| long int val; |
| |
| if (argc != 3) |
| { |
| usage(); |
| } |
| |
| /* parse the seed value */ |
| |
| val = strtol(argv[1], &endptr, 0); |
| if ((val < 2) || (val == LONG_MAX)) |
| { |
| die("seed must be in 1 < seed < LONG_MAX"); |
| } |
| |
| if ((endptr) && (*endptr)) |
| { |
| die("unexpected stuff after seed"); |
| } |
| |
| g_seed = (uint64_t) val; |
| |
| /* parse the length value */ |
| |
| val = strtol(argv[2], &endptr, 0); |
| if ((val < 1) || (val == LONG_MAX)) |
| { |
| die("length must be in 0 < length < LONG_MAX"); |
| } |
| |
| g_length = (uint64_t) val; |
| |
| if (endptr) |
| { |
| switch (*endptr) |
| { |
| case 0: |
| break; |
| |
| case 'g': case 'G': |
| g_length <<= 10; |
| case 'm': case 'M': |
| g_length <<= 10; |
| case 'k': case 'K': |
| g_length <<= 10; |
| |
| endptr++; |
| if (*endptr) |
| { |
| die("unexpected stuff after length"); |
| } |
| |
| break; |
| |
| default: die("unknown length suffix"); |
| } |
| } |
| } |
| |
| |
| /* print a string to stdout or else! |
| */ |
| static void |
| print_or_die(const char * s, ...) |
| { |
| va_list args; |
| int ret; |
| |
| va_start(args, s); |
| ret = vfprintf(stdout, s, args); |
| va_end(args); |
| |
| if (ret < 0) |
| { |
| die("sorry, vfprintf() failed!"); |
| } |
| |
| g_written += (uint64_t) ret; |
| } |
| |
| |
| /* print a string to a buffer or else! |
| */ |
| static void |
| snprintf_or_die(char * s, size_t n, const char * fmt, ...) |
| { |
| va_list args; |
| int ret; |
| |
| va_start(args, fmt); |
| ret = vsnprintf(s, n, fmt, args); |
| va_end(args); |
| |
| if ((ret < 0) || ((size_t) ret >= n)) |
| { |
| die("sorry, vsnprintf() failed!"); |
| } |
| } |
| |
| |
| /* really lame function to "indent" by the current indent level by |
| * repeatedly printing spaces |
| */ |
| static void |
| indent_or_die(void) |
| { |
| int indents = g_indents; |
| |
| while (indents > 0) |
| { |
| print_or_die(" "); |
| indents--; |
| } |
| } |
| |
| |
| /* print an opening XML-looking tag and increase indent level |
| */ |
| static void |
| open_tag(const char * s) |
| { |
| indent_or_die(); |
| print_or_die("<%s>\n", s); |
| g_indents++; |
| } |
| |
| |
| /* safely decrease indent level and print a closing XML-looking tag |
| */ |
| static void |
| close_tag(const char * s) |
| { |
| if (g_indents > 0) |
| { |
| g_indents--; |
| } |
| |
| indent_or_die(); |
| print_or_die("</%s>\n", s); |
| } |
| |
| |
| /* on one line, print an opening XML-looking tag, possibly with |
| * params, then print some contents, then print a closing tag; does |
| * not change indent level |
| */ |
| static void |
| one_line_tag(const char * tag, const char * params, |
| const char * contents) |
| { |
| indent_or_die(); |
| |
| if ((params) && (*params)) |
| { |
| print_or_die("<%s %s>%s</%s>\n", tag, params, contents, tag); |
| } |
| else |
| { |
| print_or_die("<%s>%s</%s>\n", tag, contents, tag); |
| } |
| } |
| |
| |
| static void |
| print_thing_1(void) |
| { |
| snprintf_or_die(scratchpad1, sizeof(scratchpad1), "%s=\"%s\"", |
| word(), word()); |
| snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number()); |
| one_line_tag(word(), scratchpad1, scratchpad2); |
| } |
| |
| |
| static void |
| print_thing_2(void) |
| { |
| snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number()); |
| one_line_tag(word(), NULL, scratchpad2); |
| } |
| |
| |
| static void |
| print_thing_3(void) |
| { |
| snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%s", word()); |
| one_line_tag(word(), NULL, scratchpad2); |
| } |
| |
| |
| static void |
| print_thing_4(void) |
| { |
| snprintf_or_die(scratchpad1, sizeof(scratchpad1), |
| "%s=\"%s\" %s=\"%s\" %s=\"%s\"", |
| word(), word(), word(), word(), word(), word()); |
| snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number()); |
| one_line_tag(word(), scratchpad1, scratchpad2); |
| } |
| |
| |
| static void |
| print_thing_x(int x) |
| { |
| if ((x + 30) >= (sizeof(words) / sizeof(words[0]))) |
| { |
| x = 0; |
| } |
| |
| snprintf_or_die(scratchpad1, sizeof(scratchpad1), |
| "%s=\"%s\" %s=\"%s\" %s=\"%s\"", |
| words[x + 5], words[x + 10], words[x + 15], |
| words[x + 20], words[x + 25], words[x + 30]); |
| snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", x); |
| one_line_tag(word(), scratchpad1, scratchpad2); |
| } |
| |
| |
| static void |
| print_sequence_1(void) |
| { |
| open_tag("level1"); |
| |
| print_thing_1(); |
| print_thing_2(); |
| |
| open_tag("level2"); |
| |
| print_thing_3(); |
| print_thing_x(10); |
| print_thing_4(); |
| |
| open_tag("level3"); |
| |
| print_thing_1(); |
| print_thing_2(); |
| print_thing_x(35); |
| print_thing_3(); |
| |
| open_tag("level4"); |
| |
| print_thing_3(); |
| print_thing_2(); |
| print_thing_4(); |
| |
| close_tag("level4"); |
| close_tag("level3"); |
| close_tag("level2"); |
| |
| print_thing_3(); |
| |
| close_tag("level1"); |
| } |
| |
| |
| /* generate a whole bunch of output that looks like XML with pseudo |
| * random contents, but lots of similar lines; in other words, stuff |
| * to keep a diff algorithm busy for a while |
| */ |
| static void |
| generate_output(void) |
| { |
| open_tag("level0"); |
| |
| while (g_written < g_length) |
| { |
| print_sequence_1(); |
| } |
| |
| close_tag("level0"); |
| } |
| |
| |
| int |
| main(int argc, const char * argv[]) |
| { |
| parse_args(argc, argv); |
| |
| g_curr = g_seed; |
| g_word_index = 0; |
| g_written = 0; |
| g_indents = 0; |
| |
| generate_output(); |
| |
| return 0; |
| } |
| |