c/sample/indexer.c - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <dirent.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #define CFISH_USE_SHORT_NAMES
 #define LUCY_USE_SHORT_NAMES
 #include "Clownfish/String.h"
 #include "Lucy/Analysis/EasyAnalyzer.h"
 #include "Lucy/Document/Doc.h"
 #include "Lucy/Index/Indexer.h"
 #include "Lucy/Plan/FullTextType.h"
 #include "Lucy/Plan/StringType.h"
 #include "Lucy/Plan/Schema.h"

 // Index location: main() turns this into a String and passes it to
 // Indexer_new() as the folder argument.
 const char path_to_index[] = "lucy_index";
 // Source directory: main() scans it with opendir()/readdir(), and
 // S_parse_file() prepends it to each file name before calling fopen().
 const char uscon_source[]  = "../../common/sample/us_constitution";

 static Schema*
 S_create_schema() {
     // Create a new schema.
     Schema *schema = Schema_new();

     // Create an analyzer.
     String       *language = Str_newf("en");
     EasyAnalyzer *analyzer = EasyAnalyzer_new(language);

     // Specify fields.

     {
         String *field_str = Str_newf("title");
         FullTextType *type = FullTextType_new((Analyzer*)analyzer);
         Schema_Spec_Field(schema, field_str, (FieldType*)type);
         DECREF(type);
         DECREF(field_str);
     }

     {
         String *field_str = Str_newf("content");
         FullTextType *type = FullTextType_new((Analyzer*)analyzer);
         FullTextType_Set_Highlightable(type, true);
         Schema_Spec_Field(schema, field_str, (FieldType*)type);
         DECREF(type);
         DECREF(field_str);
     }

     {
         String *field_str = Str_newf("url");
         StringType *type = StringType_new();
         StringType_Set_Indexed(type, false);
         Schema_Spec_Field(schema, field_str, (FieldType*)type);
         DECREF(type);
         DECREF(field_str);
     }

     {
         String *field_str = Str_newf("category");
         StringType *type = StringType_new();
         StringType_Set_Stored(type, false);
         Schema_Spec_Field(schema, field_str, (FieldType*)type);
         DECREF(type);
         DECREF(field_str);
     }

     DECREF(analyzer);
     DECREF(language);
     return schema;
 }

 /*
  * Report whether the NUL-terminated string `str` begins with `prefix`.
  * An empty `prefix` matches every string.
  */
 bool
 S_starts_with(const char *str, const char *prefix) {
     const size_t prefix_len = strlen(prefix);

     // strncmp stops at str's terminator, so a str shorter than prefix
     // compares unequal without a separate length check.
     return strncmp(str, prefix, prefix_len) == 0;
 }

 /*
  * Report whether the NUL-terminated string `str` ends with `postfix`.
  * An empty `postfix` matches every string.
  */
 bool
 S_ends_with(const char *str, const char *postfix) {
     const size_t str_len  = strlen(str);
     const size_t tail_len = strlen(postfix);

     // A string shorter than the postfix cannot end with it.
     if (tail_len > str_len) {
         return false;
     }

     // The tail and the postfix have the same length, so a plain string
     // comparison decides the match.
     const char *tail = str + (str_len - tail_len);
     return strcmp(tail, postfix) == 0;
 }

 /*
  * Store the C string `text` in `doc` under the field called `name`.
  * The two temporary String objects are released before returning.
  */
 static void
 S_store_text_field(Doc *doc, const char *name, const char *text) {
     String *field = Str_newf("%s", name);
     String *value = Str_new_from_utf8(text, strlen(text));
     Doc_Store(doc, field, (Obj*)value);
     DECREF(field);
     DECREF(value);
 }

 /*
  * Read one source file from `uscon_source` and turn it into a Doc.
  *
  * filename: name of the file inside `uscon_source` (no directory part).
  *
  * The first line of the file (everything up to a CR or LF) becomes the
  * "title" field.  After skipping whitespace, the following run of bytes in
  * the range 0x01-0x7F becomes the "content" field.  "url" is set to
  * `filename`, and "category" is derived from the file name prefix:
  * "art" -> article, "amend" -> amendment, "preamble" -> preamble.
  *
  * Returns a new Doc.  On any failure (allocation, open, parse, unknown
  * category) a message is written to stderr and the process exits with
  * status 1.
  */
 Doc*
 S_parse_file(const char *filename) {
     // Room for "<uscon_source>/<filename>" plus the terminating NUL.
     size_t bytes = strlen(uscon_source) + 1 + strlen(filename) + 1;
     char *path = malloc(bytes);
     if (path == NULL) {
         perror("malloc");
         exit(1);
     }
     snprintf(path, bytes, "%s/%s", uscon_source, filename);

     FILE *stream = fopen(path, "r");
     if (stream == NULL) {
         perror(path);
         exit(1);
     }

     // The 'm' modifier makes fscanf allocate both buffers; they are
     // released with free() at the end of this function.
     char *title    = NULL;
     char *bodytext = NULL;
     if (fscanf(stream, "%m[^\r\n] %m[\x01-\x7F]", &title, &bodytext) != 2) {
         fprintf(stderr, "Can't extract title/bodytext from '%s'\n", path);
         exit(1);
     }
     // Nothing else is read from the file, so close it right away.
     fclose(stream);

     const char *category = NULL;
     if (S_starts_with(filename, "art")) {
         category = "article";
     }
     else if (S_starts_with(filename, "amend")) {
         category = "amendment";
     }
     else if (S_starts_with(filename, "preamble")) {
         category = "preamble";
     }
     else {
         fprintf(stderr, "Can't derive category for %s\n", filename);
         exit(1);
     }

     Doc *doc = Doc_new(NULL, 0);
     S_store_text_field(doc, "title", title);
     S_store_text_field(doc, "content", bodytext);
     S_store_text_field(doc, "url", filename);
     S_store_text_field(doc, "category", category);

     free(bodytext);
     free(title);
     free(path);
     return doc;
 }

 /*
  * Index every file in `uscon_source` whose name ends in ".txt" and commit
  * the result to the index at `path_to_index`.
  *
  * Returns 0 after a successful commit, or 1 if the source directory cannot
  * be opened.  The indexer, folder string and schema are released on both
  * paths.
  */
 int
 main(void) {
     // Initialize the library.
     lucy_bootstrap_parcel();

     Schema *schema = S_create_schema();
     String *folder = Str_newf("%s", path_to_index);

     Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL,
                                    Indexer_CREATE | Indexer_TRUNCATE);

     int status = 1;
     DIR *dir = opendir(uscon_source);
     if (dir == NULL) {
         perror(uscon_source);
     }
     else {
         struct dirent *entry;
         while ((entry = readdir(dir)) != NULL) {
             // Only ".txt" entries are source documents.
             if (!S_ends_with(entry->d_name, ".txt")) {
                 continue;
             }
             Doc *doc = S_parse_file(entry->d_name);
             Indexer_Add_Doc(indexer, doc, 1.0);
             DECREF(doc);
         }

         closedir(dir);

         Indexer_Commit(indexer);
         status = 0;
     }

     // Single exit path so the objects are released on failure too.
     DECREF(indexer);
     DECREF(folder);
     DECREF(schema);
     return status;
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <dirent.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define CFISH_USE_SHORT_NAMES
	#define LUCY_USE_SHORT_NAMES
	#include "Clownfish/String.h"
	#include "Lucy/Analysis/EasyAnalyzer.h"
	#include "Lucy/Document/Doc.h"
	#include "Lucy/Index/Indexer.h"
	#include "Lucy/Plan/FullTextType.h"
	#include "Lucy/Plan/StringType.h"
	#include "Lucy/Plan/Schema.h"

	// Index location: main() turns this into a String and passes it to
	// Indexer_new() as the folder argument.
	const char path_to_index[] = "lucy_index";
	// Source directory: main() scans it with opendir()/readdir(), and
	// S_parse_file() prepends it to each file name before calling fopen().
	const char uscon_source[] = "../../common/sample/us_constitution";

	/*
	 * Build the Schema describing the documents in the sample index.
	 *
	 * Fields registered:
	 *   title    - FullTextType using the shared analyzer
	 *   content  - FullTextType using the shared analyzer, marked highlightable
	 *   url      - StringType with indexing switched off
	 *   category - StringType with storing switched off
	 *
	 * Every temporary created here is released with DECREF before returning;
	 * only the Schema itself is handed back to the caller.
	 */
	static Schema*
	S_create_schema() {
	    // Create a new schema.
	    Schema *schema = Schema_new();

	    // Create an analyzer.
	    String       *language = Str_newf("en");
	    EasyAnalyzer *analyzer = EasyAnalyzer_new(language);

	    // Specify fields.

	    {
	        String *field_str = Str_newf("title");
	        FullTextType *type = FullTextType_new((Analyzer*)analyzer);
	        Schema_Spec_Field(schema, field_str, (FieldType*)type);
	        DECREF(type);
	        DECREF(field_str);
	    }

	    {
	        String *field_str = Str_newf("content");
	        FullTextType *type = FullTextType_new((Analyzer*)analyzer);
	        FullTextType_Set_Highlightable(type, true);
	        Schema_Spec_Field(schema, field_str, (FieldType*)type);
	        DECREF(type);
	        DECREF(field_str);
	    }

	    {
	        String *field_str = Str_newf("url");
	        StringType *type = StringType_new();
	        StringType_Set_Indexed(type, false);
	        Schema_Spec_Field(schema, field_str, (FieldType*)type);
	        DECREF(type);
	        DECREF(field_str);
	    }

	    {
	        String *field_str = Str_newf("category");
	        StringType *type = StringType_new();
	        StringType_Set_Stored(type, false);
	        Schema_Spec_Field(schema, field_str, (FieldType*)type);
	        DECREF(type);
	        DECREF(field_str);
	    }

	    DECREF(analyzer);
	    DECREF(language);
	    return schema;
	}

	/*
	 * Report whether the NUL-terminated string `str` begins with `prefix`.
	 * An empty `prefix` matches every string.
	 */
	bool
	S_starts_with(const char *str, const char *prefix) {
	    size_t len        = strlen(str);
	    size_t prefix_len = strlen(prefix);

	    // The length check keeps memcmp from reading past the end of str.
	    return len >= prefix_len
	           && memcmp(str, prefix, prefix_len) == 0;
	}

	/*
	 * Report whether the NUL-terminated string `str` ends with `postfix`.
	 * An empty `postfix` matches every string.
	 */
	bool
	S_ends_with(const char *str, const char *postfix) {
	    size_t len         = strlen(str);
	    size_t postfix_len = strlen(postfix);

	    // The length check keeps the start of the compared tail inside str.
	    return len >= postfix_len
	           && memcmp(str + len - postfix_len, postfix, postfix_len) == 0;
	}

	/*
	 * Read one source file from `uscon_source` and turn it into a Doc.
	 *
	 * filename: name of the file inside `uscon_source` (no directory part).
	 *
	 * The first line of the file (everything up to a CR or LF) becomes the
	 * "title" field.  After skipping whitespace, the following run of bytes
	 * in the range 0x01-0x7F becomes the "content" field.  "url" is set to
	 * `filename`, and "category" is derived from the file name prefix:
	 * "art" -> article, "amend" -> amendment, "preamble" -> preamble.
	 *
	 * Returns a new Doc.  On any failure (allocation, open, parse, unknown
	 * category) a message is written to stderr and the process exits with
	 * status 1.
	 */
	Doc*
	S_parse_file(const char *filename) {
	    // Room for "<uscon_source>/<filename>" plus the terminating NUL.
	    size_t bytes = strlen(uscon_source) + 1 + strlen(filename) + 1;
	    char *path = (char*)malloc(bytes);
	    if (path == NULL) {
	        perror("malloc");
	        exit(1);
	    }
	    path[0] = '\0';
	    strcat(path, uscon_source);
	    strcat(path, "/");
	    strcat(path, filename);

	    FILE *stream = fopen(path, "r");
	    if (stream == NULL) {
	        perror(path);
	        exit(1);
	    }

	    // The 'm' modifier makes fscanf allocate both buffers; they are
	    // released with free() at the end of this function.
	    char *title    = NULL;
	    char *bodytext = NULL;
	    if (fscanf(stream, "%m[^\r\n] %m[\x01-\x7F]", &title, &bodytext) != 2) {
	        fprintf(stderr, "Can't extract title/bodytext from '%s'\n", path);
	        exit(1);
	    }

	    const char *category = NULL;
	    if (S_starts_with(filename, "art")) {
	        category = "article";
	    }
	    else if (S_starts_with(filename, "amend")) {
	        category = "amendment";
	    }
	    else if (S_starts_with(filename, "preamble")) {
	        category = "preamble";
	    }
	    else {
	        fprintf(stderr, "Can't derive category for %s\n", filename);
	        exit(1);
	    }

	    Doc *doc = Doc_new(NULL, 0);

	    {
	        // Store 'title' field
	        String *field = Str_newf("title");
	        String *value = Str_new_from_utf8(title, strlen(title));
	        Doc_Store(doc, field, (Obj*)value);
	        DECREF(field);
	        DECREF(value);
	    }

	    {
	        // Store 'content' field
	        String *field = Str_newf("content");
	        String *value = Str_new_from_utf8(bodytext, strlen(bodytext));
	        Doc_Store(doc, field, (Obj*)value);
	        DECREF(field);
	        DECREF(value);
	    }

	    {
	        // Store 'url' field
	        String *field = Str_newf("url");
	        String *value = Str_new_from_utf8(filename, strlen(filename));
	        Doc_Store(doc, field, (Obj*)value);
	        DECREF(field);
	        DECREF(value);
	    }

	    {
	        // Store 'category' field
	        String *field = Str_newf("category");
	        String *value = Str_new_from_utf8(category, strlen(category));
	        Doc_Store(doc, field, (Obj*)value);
	        DECREF(field);
	        DECREF(value);
	    }

	    fclose(stream);
	    free(bodytext);
	    free(title);
	    free(path);
	    return doc;
	}

	int
	main() {
	// Initialize the library.
	lucy_bootstrap_parcel();

	Schema *schema = S_create_schema();
	String *folder = Str_newf("%s", path_to_index);

	Indexer indexer = Indexer_new(schema, (Obj)folder, NULL,
	Indexer_CREATE \| Indexer_TRUNCATE);

	DIR *dir = opendir(uscon_source);
	if (dir == NULL) {
	perror(uscon_source);
	return 1;
	}

	for (struct dirent *entry = readdir(dir);
	entry;
	entry = readdir(dir)) {

	if (S_ends_with(entry->d_name, ".txt")) {
	Doc *doc = S_parse_file(entry->d_name);
	Indexer_Add_Doc(indexer, doc, 1.0);
	DECREF(doc);
	}
	}

	closedir(dir);

	Indexer_Commit(indexer);

	DECREF(indexer);
	DECREF(folder);
	DECREF(schema);
	return 0;
	}