packages/selector/src/text/describe-text-quote.ts - incubator-annotator - Git at Google

 /**
  * SPDX-FileCopyrightText: 2016-2021 The Apache Software Foundation
  * SPDX-License-Identifier: Apache-2.0
  * @license
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 import type { TextQuoteSelector } from '../types';
 import type { Chunk, Chunker, ChunkRange } from './chunker';
 import { chunkRangeEquals } from './chunker';
 import type { RelativeSeeker } from './seeker';
 import { TextSeeker } from './seeker';
 import { textQuoteSelectorMatcher } from '.';

 /**
  * @public
  */
 export interface DescribeTextQuoteOptions {
   /**
    * Keep prefix and suffix to the minimum that is necessary to disambiguate
    * the quote. Use only if robustness against text variations is not required.
    */
   minimalContext?: boolean;

   /**
    * Add prefix and suffix to quotes below this length, such that the total of
    * `prefix + exact + suffix` is at least this length.
    */
   minimumQuoteLength?: number;

   /**
    * When attempting to find a whitespace to make the prefix/suffix start/end
    * (resp.) at a word boundary, give up after this number of characters.
    */
   maxWordLength?: number;
 }

 /**
  * Returns a {@link TextQuoteSelector} that points at the target quote in the
  * given text.
  *
  * The selector will contain the exact target quote. In case this quote appears
  * multiple times in the text, sufficient context around the quote will be
  * included in the selector’s `prefix` and `suffix` attributes to disambiguate.
  * By default, more prefix and suffix are included than strictly required; both
  * in order to be robust against slight modifications, and in an attempt to not
  * end halfway a word (mainly for human readability).
  *
  * This is an abstract implementation of the function’s logic, which expects a
  * generic {@link Chunker} to represent the text, and a {@link ChunkRange} to
  * represent the target.
  *
  * See {@link dom.describeTextQuote} for a wrapper around this
  * implementation which applies it to the text of an HTML DOM.
  *
  * @param target - The range of characters that the selector should describe
  * @param scope - The text containing the target range; or, more accurately, a
  * function that produces {@link Chunker}s corresponding to this text.
  * @param options - Options to fine-tune the function’s behaviour.
  * @returns The {@link TextQuoteSelector} that describes `target`.
  *
  * @public
  */
 export async function describeTextQuote<TChunk extends Chunk<string>>(
   target: ChunkRange<TChunk>,
   scope: () => Chunker<TChunk>,
   options: DescribeTextQuoteOptions = {},
 ): Promise<TextQuoteSelector> {
   const {
     minimalContext = false,
     minimumQuoteLength = 0,
     maxWordLength = 50,
   } = options;

   // Create a seeker to read the target quote and the context around it.
   // TODO Possible optimisation: as it need not be an AbsoluteSeeker, a
   // different implementation could provide direct ‘jump’ access in seekToChunk
   // (the scope’s Chunker would of course also have to support this).
   const seekerAtTarget = new TextSeeker(scope());

   // Create a second seeker so that we will be able to simultaneously read
   // characters near both the target and an unintended match, if we find any.
   const seekerAtUnintendedMatch = new TextSeeker(scope());

   // Read the target’s exact text.
   seekerAtTarget.seekToChunk(target.startChunk, target.startIndex);
   const exact = seekerAtTarget.readToChunk(target.endChunk, target.endIndex);

   // Start with an empty prefix and suffix.
   let prefix = '';
   let suffix = '';

   // If the quote is below the given minimum length, add some prefix & suffix.
   const currentQuoteLength = () => prefix.length + exact.length + suffix.length;
   if (currentQuoteLength() < minimumQuoteLength) {
     // Expand the prefix, but only to reach halfway towards the desired length.
     seekerAtTarget.seekToChunk(
       target.startChunk,
       target.startIndex - prefix.length,
     );
     const length = Math.floor((minimumQuoteLength - currentQuoteLength()) / 2);
     prefix = seekerAtTarget.read(-length, false, true) + prefix;

     // If needed, expand the suffix to achieve the minimum length.
     if (currentQuoteLength() < minimumQuoteLength) {
       seekerAtTarget.seekToChunk(
         target.endChunk,
         target.endIndex + suffix.length,
       );
       const length = minimumQuoteLength - currentQuoteLength();
       suffix = suffix + seekerAtTarget.read(length, false, true);

       // We might have to expand the prefix again (if at the end of the scope).
       if (currentQuoteLength() < minimumQuoteLength) {
         seekerAtTarget.seekToChunk(
           target.startChunk,
           target.startIndex - prefix.length,
         );
         const length = minimumQuoteLength - currentQuoteLength();
         prefix = seekerAtTarget.read(-length, false, true) + prefix;
       }
     }
   }

   // Expand prefix & suffix to avoid them ending somewhere halfway in a word.
   if (!minimalContext) {
     seekerAtTarget.seekToChunk(
       target.startChunk,
       target.startIndex - prefix.length,
     );
     prefix = readUntilWhitespace(seekerAtTarget, maxWordLength, true) + prefix;
     seekerAtTarget.seekToChunk(
       target.endChunk,
       target.endIndex + suffix.length,
     );
     suffix = suffix + readUntilWhitespace(seekerAtTarget, maxWordLength, false);
   }

   // Search for matches of the quote using the current prefix and suffix. At
   // each unintended match we encounter, we extend the prefix or suffix to
   // ensure it will no longer match.
   while (true) {
     const tentativeSelector: TextQuoteSelector = {
       type: 'TextQuoteSelector',
       exact,
       prefix,
       suffix,
     };

     const matches = textQuoteSelectorMatcher(tentativeSelector)(scope());
     let nextMatch = await matches.next();

     // If this match is the intended one, no need to act.
     // XXX This test is fragile: nextMatch and target are assumed to be normalised.
     if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
       nextMatch = await matches.next();
     }

     // If there are no more unintended matches, our selector is unambiguous!
     if (nextMatch.done) return tentativeSelector;

     // Possible optimisation: A subsequent search could safely skip the part we
     // already processed, instead of starting from the beginning again. But we’d
     // need the matcher to start at the seeker’s position, instead of searching
     // in the whole current chunk. Then we could just seek back to just after
     // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t forget
     // to also correct for any changes in the prefix we will make below)

     // We’ll have to add more prefix/suffix to disqualify this unintended match.
     const unintendedMatch = nextMatch.value;

     // Count how many characters we’d need as a prefix to disqualify this match.
     seekerAtTarget.seekToChunk(
       target.startChunk,
       target.startIndex - prefix.length,
     );
     seekerAtUnintendedMatch.seekToChunk(
       unintendedMatch.startChunk,
       unintendedMatch.startIndex - prefix.length,
     );
     let extraPrefix = readUntilDifferent(
       seekerAtTarget,
       seekerAtUnintendedMatch,
       true,
     );
     if (extraPrefix !== undefined && !minimalContext)
       extraPrefix =
         readUntilWhitespace(seekerAtTarget, maxWordLength, true) + extraPrefix;

     // Count how many characters we’d need as a suffix to disqualify this match.
     seekerAtTarget.seekToChunk(
       target.endChunk,
       target.endIndex + suffix.length,
     );
     seekerAtUnintendedMatch.seekToChunk(
       unintendedMatch.endChunk,
       unintendedMatch.endIndex + suffix.length,
     );
     let extraSuffix = readUntilDifferent(
       seekerAtTarget,
       seekerAtUnintendedMatch,
       false,
     );
     if (extraSuffix !== undefined && !minimalContext)
       extraSuffix =
         extraSuffix + readUntilWhitespace(seekerAtTarget, maxWordLength, false);

     if (minimalContext) {
       // Use either the prefix or suffix, whichever is shortest.
       if (
         extraPrefix !== undefined &&
         (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
       ) {
         prefix = extraPrefix + prefix;
       } else if (extraSuffix !== undefined) {
         suffix = suffix + extraSuffix;
       } else {
         throw new Error(
           'Target cannot be disambiguated; how could that have happened‽',
         );
       }
     } else {
       // For redundancy, expand both prefix and suffix.
       if (extraPrefix !== undefined) prefix = extraPrefix + prefix;
       if (extraSuffix !== undefined) suffix = suffix + extraSuffix;
     }
   }
 }

 function readUntilDifferent(
   seeker1: RelativeSeeker,
   seeker2: RelativeSeeker,
   reverse: boolean,
 ): string | undefined {
   let result = '';
   while (true) {
     let nextCharacter: string;
     try {
       nextCharacter = seeker1.read(reverse ? -1 : 1);
     } catch (err) {
       return undefined; // Start/end of text reached: cannot expand result.
     }
     result = reverse ? nextCharacter + result : result + nextCharacter;

     // Check if the newly added character makes the result differ from the second seeker.
     let comparisonCharacter: string | undefined;
     try {
       comparisonCharacter = seeker2.read(reverse ? -1 : 1);
     } catch (err) {
       // A RangeError would merely mean seeker2 is exhausted.
       if (!(err instanceof RangeError)) throw err;
     }
     if (nextCharacter !== comparisonCharacter) return result;
   }
 }

 function readUntilWhitespace(
   seeker: RelativeSeeker,
   limit = Infinity,
   reverse = false,
 ): string {
   let result = '';
   while (result.length < limit) {
     let nextCharacter: string;
     try {
       nextCharacter = seeker.read(reverse ? -1 : 1);
     } catch (err) {
       if (!(err instanceof RangeError)) throw err;
       break; // End/start of text reached.
     }

     // Stop if we reached whitespace.
     if (isWhitespace(nextCharacter)) {
       seeker.seekBy(reverse ? 1 : -1); // ‘undo’ the last read.
       break;
     }

     result = reverse ? nextCharacter + result : result + nextCharacter;
   }
   return result;
 }

 function isWhitespace(s: string): boolean {
   return /^\s+$/.test(s);
 }
	/**
	* SPDX-FileCopyrightText: 2016-2021 The Apache Software Foundation
	* SPDX-License-Identifier: Apache-2.0
	* @license
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	import type { TextQuoteSelector } from '../types';
	import type { Chunk, Chunker, ChunkRange } from './chunker';
	import { chunkRangeEquals } from './chunker';
	import type { RelativeSeeker } from './seeker';
	import { TextSeeker } from './seeker';
	import { textQuoteSelectorMatcher } from '.';

	/**
	* @public
	*/
	export interface DescribeTextQuoteOptions {
	/**
	* Keep prefix and suffix to the minimum that is necessary to disambiguate
	* the quote. Use only if robustness against text variations is not required.
	*/
	minimalContext?: boolean;

	/**
	* Add prefix and suffix to quotes below this length, such that the total of
	* `prefix + exact + suffix` is at least this length.
	*/
	minimumQuoteLength?: number;

	/**
	* When attempting to find a whitespace to make the prefix/suffix start/end
	* (resp.) at a word boundary, give up after this number of characters.
	*/
	maxWordLength?: number;
	}

	/**
	* Returns a {@link TextQuoteSelector} that points at the target quote in the
	* given text.
	*
	* The selector will contain the exact target quote. In case this quote appears
	* multiple times in the text, sufficient context around the quote will be
	* included in the selector’s `prefix` and `suffix` attributes to disambiguate.
	* By default, more prefix and suffix are included than strictly required; both
	* in order to be robust against slight modifications, and in an attempt to not
	* end halfway a word (mainly for human readability).
	*
	* This is an abstract implementation of the function’s logic, which expects a
	* generic {@link Chunker} to represent the text, and a {@link ChunkRange} to
	* represent the target.
	*
	* See {@link dom.describeTextQuote} for a wrapper around this
	* implementation which applies it to the text of an HTML DOM.
	*
	* @param target - The range of characters that the selector should describe
	* @param scope - The text containing the target range; or, more accurately, a
	* function that produces {@link Chunker}s corresponding to this text.
	* @param options - Options to fine-tune the function’s behaviour.
	* @returns The {@link TextQuoteSelector} that describes `target`.
	*
	* @public
	*/
	export async function describeTextQuote<TChunk extends Chunk<string>>(
	target: ChunkRange<TChunk>,
	scope: () => Chunker<TChunk>,
	options: DescribeTextQuoteOptions = {},
	): Promise<TextQuoteSelector> {
	const {
	minimalContext = false,
	minimumQuoteLength = 0,
	maxWordLength = 50,
	} = options;

	// Create a seeker to read the target quote and the context around it.
	// TODO Possible optimisation: as it need not be an AbsoluteSeeker, a
	// different implementation could provide direct ‘jump’ access in seekToChunk
	// (the scope’s Chunker would of course also have to support this).
	const seekerAtTarget = new TextSeeker(scope());

	// Create a second seeker so that we will be able to simultaneously read
	// characters near both the target and an unintended match, if we find any.
	const seekerAtUnintendedMatch = new TextSeeker(scope());

	// Read the target’s exact text.
	seekerAtTarget.seekToChunk(target.startChunk, target.startIndex);
	const exact = seekerAtTarget.readToChunk(target.endChunk, target.endIndex);

	// Start with an empty prefix and suffix.
	let prefix = '';
	let suffix = '';

	// If the quote is below the given minimum length, add some prefix & suffix.
	const currentQuoteLength = () => prefix.length + exact.length + suffix.length;
	if (currentQuoteLength() < minimumQuoteLength) {
	// Expand the prefix, but only to reach halfway towards the desired length.
	seekerAtTarget.seekToChunk(
	target.startChunk,
	target.startIndex - prefix.length,
	);
	const length = Math.floor((minimumQuoteLength - currentQuoteLength()) / 2);
	prefix = seekerAtTarget.read(-length, false, true) + prefix;

	// If needed, expand the suffix to achieve the minimum length.
	if (currentQuoteLength() < minimumQuoteLength) {
	seekerAtTarget.seekToChunk(
	target.endChunk,
	target.endIndex + suffix.length,
	);
	const length = minimumQuoteLength - currentQuoteLength();
	suffix = suffix + seekerAtTarget.read(length, false, true);

	// We might have to expand the prefix again (if at the end of the scope).
	if (currentQuoteLength() < minimumQuoteLength) {
	seekerAtTarget.seekToChunk(
	target.startChunk,
	target.startIndex - prefix.length,
	);
	const length = minimumQuoteLength - currentQuoteLength();
	prefix = seekerAtTarget.read(-length, false, true) + prefix;
	}
	}
	}

	// Expand prefix & suffix to avoid them ending somewhere halfway in a word.
	if (!minimalContext) {
	seekerAtTarget.seekToChunk(
	target.startChunk,
	target.startIndex - prefix.length,
	);
	prefix = readUntilWhitespace(seekerAtTarget, maxWordLength, true) + prefix;
	seekerAtTarget.seekToChunk(
	target.endChunk,
	target.endIndex + suffix.length,
	);
	suffix = suffix + readUntilWhitespace(seekerAtTarget, maxWordLength, false);
	}

	// Search for matches of the quote using the current prefix and suffix. At
	// each unintended match we encounter, we extend the prefix or suffix to
	// ensure it will no longer match.
	while (true) {
	const tentativeSelector: TextQuoteSelector = {
	type: 'TextQuoteSelector',
	exact,
	prefix,
	suffix,
	};

	const matches = textQuoteSelectorMatcher(tentativeSelector)(scope());
	let nextMatch = await matches.next();

	// If this match is the intended one, no need to act.
	// XXX This test is fragile: nextMatch and target are assumed to be normalised.
	if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
	nextMatch = await matches.next();
	}

	// If there are no more unintended matches, our selector is unambiguous!
	if (nextMatch.done) return tentativeSelector;

	// Possible optimisation: A subsequent search could safely skip the part we
	// already processed, instead of starting from the beginning again. But we’d
	// need the matcher to start at the seeker’s position, instead of searching
	// in the whole current chunk. Then we could just seek back to just after
	// the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t forget
	// to also correct for any changes in the prefix we will make below)

	// We’ll have to add more prefix/suffix to disqualify this unintended match.
	const unintendedMatch = nextMatch.value;

	// Count how many characters we’d need as a prefix to disqualify this match.
	seekerAtTarget.seekToChunk(
	target.startChunk,
	target.startIndex - prefix.length,
	);
	seekerAtUnintendedMatch.seekToChunk(
	unintendedMatch.startChunk,
	unintendedMatch.startIndex - prefix.length,
	);
	let extraPrefix = readUntilDifferent(
	seekerAtTarget,
	seekerAtUnintendedMatch,
	true,
	);
	if (extraPrefix !== undefined && !minimalContext)
	extraPrefix =
	readUntilWhitespace(seekerAtTarget, maxWordLength, true) + extraPrefix;

	// Count how many characters we’d need as a suffix to disqualify this match.
	seekerAtTarget.seekToChunk(
	target.endChunk,
	target.endIndex + suffix.length,
	);
	seekerAtUnintendedMatch.seekToChunk(
	unintendedMatch.endChunk,
	unintendedMatch.endIndex + suffix.length,
	);
	let extraSuffix = readUntilDifferent(
	seekerAtTarget,
	seekerAtUnintendedMatch,
	false,
	);
	if (extraSuffix !== undefined && !minimalContext)
	extraSuffix =
	extraSuffix + readUntilWhitespace(seekerAtTarget, maxWordLength, false);

	if (minimalContext) {
	// Use either the prefix or suffix, whichever is shortest.
	if (
	extraPrefix !== undefined &&
	(extraSuffix === undefined \|\| extraPrefix.length <= extraSuffix.length)
	) {
	prefix = extraPrefix + prefix;
	} else if (extraSuffix !== undefined) {
	suffix = suffix + extraSuffix;
	} else {
	throw new Error(
	'Target cannot be disambiguated; how could that have happened‽',
	);
	}
	} else {
	// For redundancy, expand both prefix and suffix.
	if (extraPrefix !== undefined) prefix = extraPrefix + prefix;
	if (extraSuffix !== undefined) suffix = suffix + extraSuffix;
	}
	}
	}

	function readUntilDifferent(
	seeker1: RelativeSeeker,
	seeker2: RelativeSeeker,
	reverse: boolean,
	): string \| undefined {
	let result = '';
	while (true) {
	let nextCharacter: string;
	try {
	nextCharacter = seeker1.read(reverse ? -1 : 1);
	} catch (err) {
	return undefined; // Start/end of text reached: cannot expand result.
	}
	result = reverse ? nextCharacter + result : result + nextCharacter;

	// Check if the newly added character makes the result differ from the second seeker.
	let comparisonCharacter: string \| undefined;
	try {
	comparisonCharacter = seeker2.read(reverse ? -1 : 1);
	} catch (err) {
	// A RangeError would merely mean seeker2 is exhausted.
	if (!(err instanceof RangeError)) throw err;
	}
	if (nextCharacter !== comparisonCharacter) return result;
	}
	}

	function readUntilWhitespace(
	seeker: RelativeSeeker,
	limit = Infinity,
	reverse = false,
	): string {
	let result = '';
	while (result.length < limit) {
	let nextCharacter: string;
	try {
	nextCharacter = seeker.read(reverse ? -1 : 1);
	} catch (err) {
	if (!(err instanceof RangeError)) throw err;
	break; // End/start of text reached.
	}

	// Stop if we reached whitespace.
	if (isWhitespace(nextCharacter)) {
	seeker.seekBy(reverse ? 1 : -1); // ‘undo’ the last read.
	break;
	}

	result = reverse ? nextCharacter + result : result + nextCharacter;
	}
	return result;
	}

	function isWhitespace(s: string): boolean {
	return /^\s+$/.test(s);
	}