perl/xs/Lucy/Analysis/RegexTokenizer.c - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define C_LUCY_REGEXTOKENIZER
 #define C_LUCY_TOKEN
 #include "XSBind.h"

 #include "Lucy/Analysis/RegexTokenizer.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
 #include "Lucy/Object/Host.h"
 #include "Lucy/Util/Memory.h"
 #include "Lucy/Util/StringHelper.h"

 /* Install the compiled regex found in `token_re` as self->token_re.
  * self->pattern is not modified. */
 static void
 S_set_token_re_but_not_pattern(lucy_RegexTokenizer *self, void *token_re);

 /* Overwrite self->pattern with the stringified form of `token_re`. */
 static void
 S_set_pattern_from_token_re(lucy_RegexTokenizer *self, void *token_re);

 /* Initialize a RegexTokenizer.
  *
  * @param self     The object to initialize.
  * @param pattern  Source text of the token regex.  May be NULL, in which
  *                 case DEFAULT_PATTERN is used.
  * @return self.
  *
  * THROWs (after releasing `self`) if `pattern` contains a \p or \P
  * construct, or if the host's "compile_token_re" callback does not hand
  * back a reference.
  */
 lucy_RegexTokenizer*
 lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
                          const lucy_CharBuf *pattern) {
     SV *token_re_sv;

     lucy_Analyzer_init((lucy_Analyzer*)self);
     #define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
     if (pattern) {
         if (Lucy_CB_Find_Str(pattern, "\\p", 2) != -1
             || Lucy_CB_Find_Str(pattern, "\\P", 2) != -1
            ) {
             LUCY_DECREF(self);
             THROW(LUCY_ERR, "\\p and \\P constructs forbidden");
         }
         self->pattern = Lucy_CB_Clone(pattern);
     }
     else {
         self->pattern = lucy_CB_new_from_trusted_utf8(
                             DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
     }

     // Acquire a compiled regex engine for matching one token.
     token_re_sv = (SV*)lucy_Host_callback_host(
                       LUCY_REGEXTOKENIZER, "compile_token_re", 1,
                       CFISH_ARG_STR("pattern", self->pattern));

     // SvRV() is only meaningful on a reference, so verify before
     // dereferencing instead of trusting the callback blindly.
     if (token_re_sv == NULL || !SvROK(token_re_sv)) {
         SvREFCNT_dec(token_re_sv);
         LUCY_DECREF(self);
         THROW(LUCY_ERR, "compile_token_re did not return a reference");
     }
     S_set_token_re_but_not_pattern(self, SvRV(token_re_sv));
     SvREFCNT_dec(token_re_sv);

     return self;
 }

 /* Extract the REGEXP from `token_re` and store it in self->token_re,
  * adjusting regex reference counts.  self->pattern is left alone.
  *
  * On Perl > 5.10 the REGEXP is obtained with SvRX(); on older Perls it is
  * taken from the SV's qr magic.  THROWs if no REGEXP can be extracted.
  */
 static void
 S_set_token_re_but_not_pattern(lucy_RegexTokenizer *self, void *token_re) {
     REGEXP *old_rx = (REGEXP*)self->token_re;
 #if (PERL_VERSION > 10)
     REGEXP *rx = SvRX((SV*)token_re);
 #else
     REGEXP *rx    = NULL;
     MAGIC  *magic = NULL;
     if (SvMAGICAL((SV*)token_re)) {
         magic = mg_find((SV*)token_re, PERL_MAGIC_qr);
     }
     if (!magic) {
         THROW(LUCY_ERR, "token_re is not a qr// entity");
     }
     rx = (REGEXP*)magic->mg_obj;
 #endif
     if (rx == NULL) {
         THROW(LUCY_ERR, "Failed to extract REGEXP from token_re '%s'",
               SvPV_nolen((SV*)token_re));
     }

     // Retain the new regex before releasing the old one: if both are the
     // same object, releasing first could drop the last reference and leave
     // us incrementing a freed regex.
     (void)ReREFCNT_inc(rx);
     self->token_re = rx;
     if (old_rx) { ReREFCNT_dec(old_rx); }
 }

 /* Make self->pattern mirror the regex: wrap `token_re` in a temporary
  * reference, stringify that reference as UTF-8, and copy the resulting
  * bytes into self->pattern. */
 static void
 S_set_pattern_from_token_re(lucy_RegexTokenizer *self, void *token_re) {
     STRLEN  utf8_len = 0;
     SV     *re_ref   = newRV((SV*)token_re);
     char   *utf8     = SvPVutf8(re_ref, utf8_len);

     Lucy_CB_Mimic_Str(self->pattern, utf8, utf8_len);

     // The temporary reference is no longer needed once the bytes are copied.
     SvREFCNT_dec(re_ref);
 }

 /* Replace the tokenizer's compiled regex with the one in `token_re` and
  * update self->pattern to match it. */
 void
 lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
     // Install the regex first; this step may THROW on an unusable token_re.
     S_set_token_re_but_not_pattern(self, token_re);
     // Set pattern as a side effect.
     S_set_pattern_from_token_re(self, token_re);
 }

 /* Destructor: release the pattern string and the compiled regex, then
  * chain to the superclass destructor. */
 void
 lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
     LUCY_DECREF(self->pattern);
     // token_re may never have been set: init can release `self` before a
     // regex has been compiled.  Guard the release, as the setter does.
     if (self->token_re) { ReREFCNT_dec(((REGEXP*)self->token_re)); }
     LUCY_SUPER_DESTROY(self, LUCY_REGEXTOKENIZER);
 }

 /* Tokenize `string` (`string_len` bytes) by repeatedly running the compiled
  * token regex over it, appending one Token per match to `inversion`.
  *
  * Each Token receives the matched bytes, start/end offsets counted in
  * Unicode code points from the beginning of the string, a boost of 1.0 and
  * a position increment of 1.  THROWs if stepping over UTF-8 sequences runs
  * past the end of the buffer.
  */
 void
 lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self,
                                  const char *string, size_t string_len,
                                  lucy_Inversion *inversion) {
     // Running code point count; never reset, so offsets are cumulative
     // across matches.
     uint32_t   num_code_points = 0;
     SV        *wrapper    = sv_newmortal();
 #if (PERL_VERSION > 10)
     // Here the regexp struct is reached through SvANY() of the REGEXP.
     REGEXP    *rx         = (REGEXP*)self->token_re;
     regexp    *rx_struct  = (regexp*)SvANY(rx);
 #else
     // Here the REGEXP pointer is used directly as the regexp struct.
     REGEXP    *rx         = (REGEXP*)self->token_re;
     regexp    *rx_struct  = rx;
 #endif
     char      *string_beg = (char*)string;
     char      *string_end = string_beg + string_len;
     // Scan cursor: where the next match attempt starts.
     char      *string_arg = string_beg;


     // Fake up an SV wrapper to feed to the regex engine.
     sv_upgrade(wrapper, SVt_PV);
     SvREADONLY_on(wrapper);
     // NOTE(review): presumably a zero SvLEN marks the buffer as not owned by
     // the SV, so Perl will not try to free the borrowed string -- confirm.
     SvLEN(wrapper) = 0;
     SvUTF8_on(wrapper);

     // Wrap the string in an SV to please the regex engine.
     SvPVX(wrapper) = string_beg;
     SvCUR_set(wrapper, string_len);
     SvPOK_on(wrapper);

     // string_arg is passed both as the position to match from and as the
     // string start, so the match offsets read below are relative to it.
     while (pregexec(rx, string_arg, string_end, string_arg, 1, wrapper, 1)) {
 #if ((PERL_VERSION >= 10) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5))
         char *const start_ptr = string_arg + rx_struct->offs[0].start;
         char *const end_ptr   = string_arg + rx_struct->offs[0].end;
 #else
         char *const start_ptr = string_arg + rx_struct->startp[0];
         char *const end_ptr   = string_arg + rx_struct->endp[0];
 #endif
         uint32_t start, end;

         // Get start and end offsets in Unicode code points.
         // First skip the unmatched gap before the token...
         for (; string_arg < start_ptr; num_code_points++) {
             string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
             if (string_arg > string_end) {
                 // NOTE(review): "%s" assumes string_beg is NUL-terminated,
                 // yet the buffer is length-delimited -- confirm with callers.
                 THROW(LUCY_ERR, "scanned past end of '%s'", string_beg);
             }
         }
         start = num_code_points;
         // ...then walk the token itself.  This leaves string_arg at or
         // beyond end_ptr, which is where the next match attempt begins.
         for (; string_arg < end_ptr; num_code_points++) {
             string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
             if (string_arg > string_end) {
                 THROW(LUCY_ERR, "scanned past end of '%s'", string_beg);
             }
         }
         end = num_code_points;

         // Add a token to the new inversion.
         Lucy_Inversion_Append(inversion,
                               lucy_Token_new(
                                   start_ptr,
                                   (end_ptr - start_ptr),
                                   start,
                                   end,
                                   1.0f,   // boost always 1 for now
                                   1       // position increment
                               )
                              );
     }
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#define C_LUCY_REGEXTOKENIZER
	#define C_LUCY_TOKEN
	#include "XSBind.h"

	#include "Lucy/Analysis/RegexTokenizer.h"
	#include "Lucy/Analysis/Token.h"
	#include "Lucy/Analysis/Inversion.h"
	#include "Lucy/Object/Host.h"
	#include "Lucy/Util/Memory.h"
	#include "Lucy/Util/StringHelper.h"

	/* Install the compiled regex found in `token_re` as self->token_re.
	 * self->pattern is not modified. */
	static void
	S_set_token_re_but_not_pattern(lucy_RegexTokenizer *self, void *token_re);

	/* Overwrite self->pattern with the stringified form of `token_re`. */
	static void
	S_set_pattern_from_token_re(lucy_RegexTokenizer *self, void *token_re);

	/* Initialize a RegexTokenizer.
	 *
	 * @param self     The object to initialize.
	 * @param pattern  Source text of the token regex.  May be NULL, in which
	 *                 case DEFAULT_PATTERN is used.
	 * @return self.
	 *
	 * THROWs (after releasing `self`) if `pattern` contains a \p or \P
	 * construct, or if the host's "compile_token_re" callback does not hand
	 * back a reference.
	 */
	lucy_RegexTokenizer*
	lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
	                         const lucy_CharBuf *pattern) {
	    SV *token_re_sv;

	    lucy_Analyzer_init((lucy_Analyzer*)self);
	    #define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
	    if (pattern) {
	        if (Lucy_CB_Find_Str(pattern, "\\p", 2) != -1
	            || Lucy_CB_Find_Str(pattern, "\\P", 2) != -1
	           ) {
	            LUCY_DECREF(self);
	            THROW(LUCY_ERR, "\\p and \\P constructs forbidden");
	        }
	        self->pattern = Lucy_CB_Clone(pattern);
	    }
	    else {
	        self->pattern = lucy_CB_new_from_trusted_utf8(
	                            DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
	    }

	    // Acquire a compiled regex engine for matching one token.
	    token_re_sv = (SV*)lucy_Host_callback_host(
	                      LUCY_REGEXTOKENIZER, "compile_token_re", 1,
	                      CFISH_ARG_STR("pattern", self->pattern));

	    // SvRV() is only meaningful on a reference, so verify before
	    // dereferencing instead of trusting the callback blindly.
	    if (token_re_sv == NULL || !SvROK(token_re_sv)) {
	        SvREFCNT_dec(token_re_sv);
	        LUCY_DECREF(self);
	        THROW(LUCY_ERR, "compile_token_re did not return a reference");
	    }
	    S_set_token_re_but_not_pattern(self, SvRV(token_re_sv));
	    SvREFCNT_dec(token_re_sv);

	    return self;
	}

	/* Extract the REGEXP from `token_re` and store it in self->token_re,
	 * adjusting regex reference counts.  self->pattern is left alone.
	 *
	 * On Perl > 5.10 the REGEXP is obtained with SvRX(); on older Perls it is
	 * taken from the SV's qr magic.  THROWs if no REGEXP can be extracted.
	 */
	static void
	S_set_token_re_but_not_pattern(lucy_RegexTokenizer *self, void *token_re) {
	    REGEXP *old_rx = (REGEXP*)self->token_re;
	#if (PERL_VERSION > 10)
	    REGEXP *rx = SvRX((SV*)token_re);
	#else
	    REGEXP *rx    = NULL;
	    MAGIC  *magic = NULL;
	    if (SvMAGICAL((SV*)token_re)) {
	        magic = mg_find((SV*)token_re, PERL_MAGIC_qr);
	    }
	    if (!magic) {
	        THROW(LUCY_ERR, "token_re is not a qr// entity");
	    }
	    rx = (REGEXP*)magic->mg_obj;
	#endif
	    if (rx == NULL) {
	        THROW(LUCY_ERR, "Failed to extract REGEXP from token_re '%s'",
	              SvPV_nolen((SV*)token_re));
	    }

	    // Retain the new regex before releasing the old one: if both are the
	    // same object, releasing first could drop the last reference and leave
	    // us incrementing a freed regex.
	    (void)ReREFCNT_inc(rx);
	    self->token_re = rx;
	    if (old_rx) { ReREFCNT_dec(old_rx); }
	}

	/* Make self->pattern mirror the regex: wrap `token_re` in a temporary
	 * reference, stringify that reference as UTF-8, and copy the resulting
	 * bytes into self->pattern. */
	static void
	S_set_pattern_from_token_re(lucy_RegexTokenizer *self, void *token_re) {
	    SV *rv = newRV((SV*)token_re);
	    STRLEN len = 0;
	    char *ptr = SvPVutf8((SV*)rv, len);
	    Lucy_CB_Mimic_Str(self->pattern, ptr, len);
	    // The temporary reference is no longer needed once the bytes are copied.
	    SvREFCNT_dec(rv);
	}

	/* Replace the tokenizer's compiled regex with the one in `token_re` and
	 * update self->pattern to match it. */
	void
	lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
	    // Install the regex first; this step may THROW on an unusable token_re.
	    S_set_token_re_but_not_pattern(self, token_re);
	    // Set pattern as a side effect.
	    S_set_pattern_from_token_re(self, token_re);
	}

	/* Destructor: release the pattern string and the compiled regex, then
	 * chain to the superclass destructor. */
	void
	lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
	    LUCY_DECREF(self->pattern);
	    // token_re may never have been set: init can release `self` before a
	    // regex has been compiled.  Guard the release, as the setter does.
	    if (self->token_re) { ReREFCNT_dec(((REGEXP*)self->token_re)); }
	    LUCY_SUPER_DESTROY(self, LUCY_REGEXTOKENIZER);
	}

	/* Tokenize `string` (`string_len` bytes) by repeatedly running the compiled
	 * token regex over it, appending one Token per match to `inversion`.
	 *
	 * Each Token receives the matched bytes, start/end offsets counted in
	 * Unicode code points from the beginning of the string, a boost of 1.0 and
	 * a position increment of 1.  THROWs if stepping over UTF-8 sequences runs
	 * past the end of the buffer.
	 */
	void
	lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self,
	                                 const char *string, size_t string_len,
	                                 lucy_Inversion *inversion) {
	    // Running code point count; never reset, so offsets are cumulative
	    // across matches.
	    uint32_t   num_code_points = 0;
	    SV        *wrapper    = sv_newmortal();
	#if (PERL_VERSION > 10)
	    // Here the regexp struct is reached through SvANY() of the REGEXP.
	    REGEXP    *rx         = (REGEXP*)self->token_re;
	    regexp    *rx_struct  = (regexp*)SvANY(rx);
	#else
	    // Here the REGEXP pointer is used directly as the regexp struct.
	    REGEXP    *rx         = (REGEXP*)self->token_re;
	    regexp    *rx_struct  = rx;
	#endif
	    char      *string_beg = (char*)string;
	    char      *string_end = string_beg + string_len;
	    // Scan cursor: where the next match attempt starts.
	    char      *string_arg = string_beg;

	    // Fake up an SV wrapper to feed to the regex engine.
	    sv_upgrade(wrapper, SVt_PV);
	    SvREADONLY_on(wrapper);
	    SvLEN(wrapper) = 0;
	    SvUTF8_on(wrapper);

	    // Wrap the string in an SV to please the regex engine.
	    SvPVX(wrapper) = string_beg;
	    SvCUR_set(wrapper, string_len);
	    SvPOK_on(wrapper);

	    // string_arg is passed both as the position to match from and as the
	    // string start, so the match offsets read below are relative to it.
	    while (pregexec(rx, string_arg, string_end, string_arg, 1, wrapper, 1)) {
	#if ((PERL_VERSION >= 10) || (PERL_VERSION == 9 && PERL_SUBVERSION >= 5))
	        char *const start_ptr = string_arg + rx_struct->offs[0].start;
	        char *const end_ptr   = string_arg + rx_struct->offs[0].end;
	#else
	        char *const start_ptr = string_arg + rx_struct->startp[0];
	        char *const end_ptr   = string_arg + rx_struct->endp[0];
	#endif
	        uint32_t start, end;

	        // Get start and end offsets in Unicode code points.
	        // First skip the unmatched gap before the token...
	        for (; string_arg < start_ptr; num_code_points++) {
	            string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
	            if (string_arg > string_end) {
	                THROW(LUCY_ERR, "scanned past end of '%s'", string_beg);
	            }
	        }
	        start = num_code_points;
	        // ...then walk the token itself.  This leaves string_arg at or
	        // beyond end_ptr, which is where the next match attempt begins.
	        for (; string_arg < end_ptr; num_code_points++) {
	            string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
	            if (string_arg > string_end) {
	                THROW(LUCY_ERR, "scanned past end of '%s'", string_beg);
	            }
	        }
	        end = num_code_points;

	        // Add a token to the new inversion.
	        Lucy_Inversion_Append(inversion,
	                              lucy_Token_new(
	                                  start_ptr,
	                                  (end_ptr - start_ptr),
	                                  start,
	                                  end,
	                                  1.0f,   // boost always 1 for now
	                                  1       // position increment
	                              )
	                             );
	    }
	}