core/Lucy/Analysis/Inversion.c - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define C_LUCY_INVERSION
 #define C_LUCY_TOKEN
 #include "Lucy/Util/ToolSet.h"

 #include "Lucy/Analysis/Inversion.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Util/SortUtils.h"

 #ifndef SIZE_MAX
   #define SIZE_MAX ((size_t)-1)
 #endif

 // After inversion, record how many like tokens occur in each group.
 static void
 S_count_clusters(Inversion *self);

 Inversion*
 Inversion_new(Token *seed_token) {
     Inversion *self = (Inversion*)VTable_Make_Obj(INVERSION);

     // Init.
     self->cap                 = 16;
     self->size                = 0;
     self->tokens              = (Token**)CALLOCATE(self->cap, sizeof(Token*));
     self->cur                 = 0;
     self->inverted            = false;
     self->cluster_counts      = NULL;
     self->cluster_counts_size = 0;

     // Process the seed token.
     if (seed_token != NULL) {
         Inversion_append(self, (Token*)INCREF(seed_token));
     }

     return self;
 }

 void
 Inversion_destroy(Inversion *self) {
     if (self->tokens) {
         Token **tokens       = self->tokens;
         Token **const limit  = tokens + self->size;
         for (; tokens < limit; tokens++) {
             DECREF(*tokens);
         }
         FREEMEM(self->tokens);
     }
     FREEMEM(self->cluster_counts);
     SUPER_DESTROY(self, INVERSION);
 }

 uint32_t
 Inversion_get_size(Inversion *self) {
     return self->size;
 }

 Token*
 Inversion_next(Inversion *self) {
     // Kill the iteration if we're out of tokens.
     if (self->cur == self->size) {
         return NULL;
     }
     return self->tokens[self->cur++];
 }

 void
 Inversion_reset(Inversion *self) {
     self->cur = 0;
 }

 static void
 S_grow(Inversion *self, size_t size) {
     if (size > self->cap) {
         uint64_t amount = size * sizeof(Token*);
         // Clip rather than wrap.
         if (amount > SIZE_MAX || amount < size) { amount = SIZE_MAX; }
         self->tokens = (Token**)REALLOCATE(self->tokens, (size_t)amount);
         self->cap    = size;
         memset(self->tokens + self->size, 0,
                (size - self->size) * sizeof(Token*));
     }
 }

 void
 Inversion_append(Inversion *self, Token *token) {
     if (self->inverted) {
         THROW(ERR, "Can't append tokens after inversion");
     }
     if (self->size >= self->cap) {
         size_t new_capacity = Memory_oversize(self->size + 1, sizeof(Token*));
         S_grow(self, new_capacity);
     }
     self->tokens[self->size] = token;
     self->size++;
 }

 Token**
 Inversion_next_cluster(Inversion *self, uint32_t *count) {
     Token **cluster = self->tokens + self->cur;

     if (self->cur == self->size) {
         *count = 0;
         return NULL;
     }

     // Don't read past the end of the cluster counts array.
     if (!self->inverted) {
         THROW(ERR, "Inversion not yet inverted");
     }
     if (self->cur > self->cluster_counts_size) {
         THROW(ERR, "Tokens were added after inversion");
     }

     // Place cluster count in passed-in var, advance bookmark.
     *count = self->cluster_counts[self->cur];
     self->cur += *count;

     return cluster;
 }

 void
 Inversion_invert(Inversion *self) {
     Token   **tokens = self->tokens;
     Token   **limit  = tokens + self->size;
     int32_t   token_pos = 0;

     // Thwart future attempts to append.
     if (self->inverted) {
         THROW(ERR, "Inversion has already been inverted");
     }
     self->inverted = true;

     // Assign token positions.
     for (; tokens < limit; tokens++) {
         Token *const cur_token = *tokens;
         cur_token->pos = token_pos;
         token_pos += cur_token->pos_inc;
         if (token_pos < cur_token->pos) {
             THROW(ERR, "Token positions out of order: %i32 %i32",
                   cur_token->pos, token_pos);
         }
     }

     // Sort the tokens lexically, and hand off to cluster counting routine.
     Sort_quicksort(self->tokens, self->size, sizeof(Token*), Token_compare,
                    NULL);
     S_count_clusters(self);
 }

 static void
 S_count_clusters(Inversion *self) {
     Token **tokens = self->tokens;
     uint32_t *counts
         = (uint32_t*)CALLOCATE(self->size + 1, sizeof(uint32_t));

     // Save the cluster counts.
     self->cluster_counts_size = self->size;
     self->cluster_counts = counts;

     for (uint32_t i = 0; i < self->size;) {
         Token *const base_token = tokens[i];
         char  *const base_text  = base_token->text;
         const size_t base_len   = base_token->len;
         uint32_t     j          = i + 1;

         // Iterate through tokens until text doesn't match.
         while (j < self->size) {
             Token *const candidate = tokens[j];

             if ((candidate->len == base_len)
                 && (memcmp(candidate->text, base_text, base_len) == 0)
                ) {
                 j++;
             }
             else {
                 break;
             }
         }

         // Record count at the position of the first token in the cluster.
         counts[i] = j - i;

         // Start the next loop at the next token we haven't seen.
         i = j;
     }
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#define C_LUCY_INVERSION
	#define C_LUCY_TOKEN
	#include "Lucy/Util/ToolSet.h"

	#include "Lucy/Analysis/Inversion.h"
	#include "Lucy/Analysis/Token.h"
	#include "Lucy/Util/SortUtils.h"

	#ifndef SIZE_MAX
	#define SIZE_MAX ((size_t)-1)
	#endif

	// After inversion, record how many like tokens occur in each group.
	static void
	S_count_clusters(Inversion *self);

	Inversion*
	Inversion_new(Token *seed_token) {
	Inversion self = (Inversion)VTable_Make_Obj(INVERSION);

	// Init.
	self->cap = 16;
	self->size = 0;
	self->tokens = (Token*)CALLOCATE(self->cap, sizeof(Token));
	self->cur = 0;
	self->inverted = false;
	self->cluster_counts = NULL;
	self->cluster_counts_size = 0;

	// Process the seed token.
	if (seed_token != NULL) {
	Inversion_append(self, (Token*)INCREF(seed_token));
	}

	return self;
	}

	void
	Inversion_destroy(Inversion *self) {
	if (self->tokens) {
	Token **tokens = self->tokens;
	Token **const limit = tokens + self->size;
	for (; tokens < limit; tokens++) {
	DECREF(*tokens);
	}
	FREEMEM(self->tokens);
	}
	FREEMEM(self->cluster_counts);
	SUPER_DESTROY(self, INVERSION);
	}

	uint32_t
	Inversion_get_size(Inversion *self) {
	return self->size;
	}

	Token*
	Inversion_next(Inversion *self) {
	// Kill the iteration if we're out of tokens.
	if (self->cur == self->size) {
	return NULL;
	}
	return self->tokens[self->cur++];
	}

	void
	Inversion_reset(Inversion *self) {
	self->cur = 0;
	}

	static void
	S_grow(Inversion *self, size_t size) {
	if (size > self->cap) {
	uint64_t amount = size * sizeof(Token*);
	// Clip rather than wrap.
	if (amount > SIZE_MAX \|\| amount < size) { amount = SIZE_MAX; }
	self->tokens = (Token**)REALLOCATE(self->tokens, (size_t)amount);
	self->cap = size;
	memset(self->tokens + self->size, 0,
	(size - self->size) * sizeof(Token*));
	}
	}

	void
	Inversion_append(Inversion self, Token token) {
	if (self->inverted) {
	THROW(ERR, "Can't append tokens after inversion");
	}
	if (self->size >= self->cap) {
	size_t new_capacity = Memory_oversize(self->size + 1, sizeof(Token*));
	S_grow(self, new_capacity);
	}
	self->tokens[self->size] = token;
	self->size++;
	}

	Token**
	Inversion_next_cluster(Inversion self, uint32_t count) {
	Token **cluster = self->tokens + self->cur;

	if (self->cur == self->size) {
	*count = 0;
	return NULL;
	}

	// Don't read past the end of the cluster counts array.
	if (!self->inverted) {
	THROW(ERR, "Inversion not yet inverted");
	}
	if (self->cur > self->cluster_counts_size) {
	THROW(ERR, "Tokens were added after inversion");
	}

	// Place cluster count in passed-in var, advance bookmark.
	*count = self->cluster_counts[self->cur];
	self->cur += *count;

	return cluster;
	}

	void
	Inversion_invert(Inversion *self) {
	Token **tokens = self->tokens;
	Token **limit = tokens + self->size;
	int32_t token_pos = 0;

	// Thwart future attempts to append.
	if (self->inverted) {
	THROW(ERR, "Inversion has already been inverted");
	}
	self->inverted = true;

	// Assign token positions.
	for (; tokens < limit; tokens++) {
	Token const cur_token = tokens;
	cur_token->pos = token_pos;
	token_pos += cur_token->pos_inc;
	if (token_pos < cur_token->pos) {
	THROW(ERR, "Token positions out of order: %i32 %i32",
	cur_token->pos, token_pos);
	}
	}

	// Sort the tokens lexically, and hand off to cluster counting routine.
	Sort_quicksort(self->tokens, self->size, sizeof(Token*), Token_compare,
	NULL);
	S_count_clusters(self);
	}

	static void
	S_count_clusters(Inversion *self) {
	Token **tokens = self->tokens;
	uint32_t *counts
	= (uint32_t*)CALLOCATE(self->size + 1, sizeof(uint32_t));

	// Save the cluster counts.
	self->cluster_counts_size = self->size;
	self->cluster_counts = counts;

	for (uint32_t i = 0; i < self->size;) {
	Token *const base_token = tokens[i];
	char *const base_text = base_token->text;
	const size_t base_len = base_token->len;
	uint32_t j = i + 1;

	// Iterate through tokens until text doesn't match.
	while (j < self->size) {
	Token *const candidate = tokens[j];

	if ((candidate->len == base_len)
	&& (memcmp(candidate->text, base_text, base_len) == 0)
	) {
	j++;
	}
	else {
	break;
	}
	}

	// Record count at the position of the first token in the cluster.
	counts[i] = j - i;

	// Start the next loop at the next token we haven't seen.
	i = j;
	}
	}