// packages/selector/src/text/chunker.ts - incubator-annotator - Git at Google

 /**
  * SPDX-FileCopyrightText: 2016-2021 The Apache Software Foundation
  * SPDX-License-Identifier: Apache-2.0
  * @license
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /**
  * Represents a piece of text in any kind of ‘file’.
  *
  * Its purpose is to enable generic algorithms to deal with text content of any
  * type of ‘file’ that consists of many pieces of text (e.g. a DOM, PDF, …).
  * Each Chunk represents one piece of text ({@link Chunk.data}). An object
  * implementing this interface would typically have other attributes as well to
  * map the chunk back to its position in the file (e.g. a Text node in the DOM).
  *
  * @typeParam TData - Piece of text, typically `string`
  *
  * @public
  */
 export interface Chunk<TData> {
   /**
    * The piece of text this chunk represents.
    */
   readonly data: TData;
   /**
    * Optional custom equality test against another chunk of the same type.
    *
    * When a chunk defines this method, {@link chunkEquals} calls it instead of
    * comparing the two objects by identity.
    *
    * @param otherChunk - The chunk to compare this one with.
    * @returns Whether the two chunks are to be considered equal.
    */
   equals?(otherChunk: this): boolean;
 }

 /**
  * Decide whether two {@link Chunk}s are equal.
  *
  * Two chunks are equal when they stand for the same piece of text (i.e. at the
  * same position) in the file. The decision is delegated to a custom
  * {@link Chunk.equals} method when one is available: `chunk1`’s method is
  * preferred, then `chunk2`’s. If neither chunk defines one, the chunks are
  * compared by object identity (`chunk1 === chunk2`).
  *
  * @param chunk1 - First chunk; its `equals` method, if any, takes precedence.
  * @param chunk2 - Second chunk; its `equals` method is used only if `chunk1`
  * has none.
  * @returns The result of the selected comparison.
  *
  * @public
  */
 export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
   // Try chunk1's comparator, then chunk2's, and finally plain identity.
   return chunk1.equals
     ? chunk1.equals(chunk2)
     : chunk2.equals
       ? chunk2.equals(chunk1)
       : chunk1 === chunk2;
 }

 /**
  * Points at a range of characters between two points inside {@link Chunk}s.
  *
  * Analogous to the DOM’s ({@link https://developer.mozilla.org/en-US/docs/Web/API/AbstractRange
  * | Abstract}){@link https://developer.mozilla.org/en-US/docs/Web/API/Range |
  * Range}. Each index expresses an offset inside the value of the corresponding
  * {@link Chunk.data}, and can equal the length of that data in order to point
  * to the position right after the chunk’s last character.
  *
  * @typeParam TChunk - (sub)type of `Chunk` the range points into.
  *
  * @public
  */
 export interface ChunkRange<TChunk extends Chunk<any>> {
   /** The chunk in which the range starts. */
   startChunk: TChunk;
   /** Offset of the range’s start inside the data of `startChunk`. */
   startIndex: number;
   /** The chunk in which the range ends. */
   endChunk: TChunk;
   /** Offset of the range’s end inside the data of `endChunk`. */
   endIndex: number;
 }

 /**
  * Decide whether two {@link ChunkRange}s are equal.
  *
  * Ranges are equal when all four of their properties match:
  * {@link ChunkRange.startChunk}, {@link ChunkRange.startIndex},
  * {@link ChunkRange.endChunk} and {@link ChunkRange.endIndex}. The chunks are
  * compared with {@link chunkEquals}, so a custom {@link Chunk.equals} method
  * is honoured when defined; the indices are compared with `===`.
  *
  * The comparison is purely structural. If one range’s boundary sits at the end
  * of a chunk and the other’s at the start of a subsequent chunk, the ranges are
  * reported as different even though they may cover the same characters. To
  * test for such semantic equivalence, normalise both inputs first: typically
  * this means shrinking the range to its narrowest equivalent, and (if it is
  * empty) positioning it at its first equivalent.
  *
  * @param range1 - First range.
  * @param range2 - Second range.
  * @returns Whether both ranges have equal start and end chunks and indices.
  *
  * @public
  */
 export function chunkRangeEquals(
   range1: ChunkRange<any>,
   range2: ChunkRange<any>,
 ): boolean {
   // Chunks are checked first, start before end; a failed check ends the
   // comparison immediately and its result is passed through unchanged.
   const sameStartChunk = chunkEquals(range1.startChunk, range2.startChunk);
   if (!sameStartChunk) return sameStartChunk;

   const sameEndChunk = chunkEquals(range1.endChunk, range2.endChunk);
   if (!sameEndChunk) return sameEndChunk;

   return (
     range1.startIndex === range2.startIndex &&
     range1.endIndex === range2.endIndex
   );
 }

 /**
  * Presents the pieces of text contained in some underlying ‘file’ as a sequence
  * of {@link Chunk}s.
  *
  * Rather than presenting a list of all pieces, the `Chunker` provides methods
  * to walk through the file piece by piece. This permits implementations to read
  * and convert the file to `Chunk`s lazily.
  *
  * For those familiar with the DOM APIs, it is similar to a NodeIterator (but
  * unlike NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk).
  *
  * @typeParam TChunk - (sub)type of `Chunk` being used.
  *
  * @public
  */
 export interface Chunker<TChunk extends Chunk<any>> {
   /**
    * The chunk currently being pointed at.
    *
    * Initially, this should normally be the first chunk in the file.
    */
   readonly currentChunk: TChunk;

   /**
    * Point {@link Chunker.currentChunk} at the chunk following it, and return
    * that chunk. If there are no chunks following it, keep `currentChunk`
    * unchanged and return null.
    *
    * @returns The new current chunk, or `null` if no chunk follows.
    */
   nextChunk(): TChunk | null;

   /**
    * Point {@link Chunker.currentChunk} at the chunk preceding it, and return
    * that chunk. If there are no chunks preceding it, keep `currentChunk`
    * unchanged and return null.
    *
    * @returns The new current chunk, or `null` if no chunk precedes.
    */
   previousChunk(): TChunk | null;

   /**
    * Test if a given `chunk` is before the {@link Chunker.currentChunk|current
    * chunk}.
    *
    * Returns true if `chunk` is before `this.currentChunk`, false otherwise
    * (i.e. if `chunk` follows it or is the current chunk).
    *
    * The given `chunk` need not necessarily be obtained from the same `Chunker`,
    * but the chunkers would need to represent the same file. Otherwise behaviour
    * is unspecified (an implementation might throw or just return `false`).
    *
    * @param chunk - A chunk, typically obtained from the same `Chunker`.
    * @returns Whether `chunk` is positioned before the current chunk.
    */
   precedesCurrentChunk(chunk: TChunk): boolean;
 }
	/**
	* SPDX-FileCopyrightText: 2016-2021 The Apache Software Foundation
	* SPDX-License-Identifier: Apache-2.0
	* @license
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/**
	* Represents a piece of text in any kind of ‘file’.
	*
	* Its purpose is to enable generic algorithms to deal with text content of any
	* type of ‘file’ that consists of many pieces of text (e.g. a DOM, PDF, …).
	* Each Chunk represents one piece of text ({@link Chunk.data}). An object
	* implementing this interface would typically have other attributes as well to
	* map the chunk back to its position in the file (e.g. a Text node in the DOM).
	*
	* @typeParam TData - Piece of text, typically `string`
	*
	* @public
	*/
	export interface Chunk<TData> {
	/**
	* The piece of text this chunk represents.
	*/
	readonly data: TData;
	/**
	* Optional custom equality test against another chunk of the same type.
	*
	* When a chunk defines this method, {@link chunkEquals} calls it instead of
	* comparing the two objects by identity.
	*
	* @param otherChunk - The chunk to compare this one with.
	* @returns Whether the two chunks are to be considered equal.
	*/
	equals?(otherChunk: this): boolean;
	}

	/**
	 * Test two {@link Chunk}s for equality.
	 *
	 * Chunks count as equal when both represent the same piece of text (i.e. at
	 * the same position) in the file. A custom {@link Chunk.equals} method is used
	 * when either chunk defines one (`chunk1`’s is consulted first); otherwise the
	 * check falls back to object identity (`chunk1 === chunk2`).
	 *
	 * @param chunk1 - First chunk to compare.
	 * @param chunk2 - Second chunk to compare.
	 * @returns The outcome of the custom comparison, or of the identity check.
	 *
	 * @public
	 */
	export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
	  if (chunk1.equals) {
	    // The first chunk knows how to compare itself.
	    return chunk1.equals(chunk2);
	  } else if (chunk2.equals) {
	    // Only the second chunk has a comparator; ask it instead.
	    return chunk2.equals(chunk1);
	  } else {
	    // No custom comparator at all: same object means same chunk.
	    return chunk1 === chunk2;
	  }
	}

	/**
	* Points at a range of characters between two points inside {@link Chunk}s.
	*
	* Analogous to the DOM’s ({@link https://developer.mozilla.org/en-US/docs/Web/API/AbstractRange
	* | Abstract}){@link https://developer.mozilla.org/en-US/docs/Web/API/Range |
	* Range}. Each index expresses an offset inside the value of the corresponding
	* {@link Chunk.data}, and can equal the length of that data in order to point
	* to the position right after the chunk’s last character.
	*
	* @typeParam TChunk - (sub)type of `Chunk` the range points into.
	*
	* @public
	*/
	export interface ChunkRange<TChunk extends Chunk<any>> {
	/** The chunk in which the range starts. */
	startChunk: TChunk;
	/** Offset of the range’s start inside the data of `startChunk`. */
	startIndex: number;
	/** The chunk in which the range ends. */
	endChunk: TChunk;
	/** Offset of the range’s end inside the data of `endChunk`. */
	endIndex: number;
	}

	/**
	 * Test two {@link ChunkRange}s for equality.
	 *
	 * Both ranges must agree on each of their four properties
	 * ({@link ChunkRange.startChunk}, {@link ChunkRange.startIndex},
	 * {@link ChunkRange.endChunk}, and {@link ChunkRange.endIndex}). The start and
	 * end chunks are compared through {@link chunkEquals}, which uses the custom
	 * {@link Chunk.equals} method if defined.
	 *
	 * Note that a boundary at the end of one chunk and a boundary at the start of
	 * a subsequent chunk are not considered equal, even though semantically they
	 * may denote the same position. To test for such semantic equivalence, ensure
	 * that both inputs are normalised: typically this means the range is shrunk to
	 * its narrowest equivalent, and (if it is empty) positioned at its first
	 * equivalent.
	 *
	 * @param range1 - First range to compare.
	 * @param range2 - Second range to compare.
	 * @returns Whether the chunks and indices of both ranges match.
	 *
	 * @public
	 */
	export function chunkRangeEquals(
	  range1: ChunkRange<any>,
	  range2: ChunkRange<any>,
	): boolean {
	  const startsMatch = chunkEquals(range1.startChunk, range2.startChunk);
	  if (startsMatch) {
	    const endsMatch = chunkEquals(range1.endChunk, range2.endChunk);
	    if (endsMatch) {
	      // Both chunk pairs agree; the offsets settle the matter.
	      return (
	        range1.startIndex === range2.startIndex &&
	        range1.endIndex === range2.endIndex
	      );
	    }
	    // End chunks differ: hand back that comparison's result as-is.
	    return endsMatch;
	  }
	  // Start chunks differ: the end chunks are never compared.
	  return startsMatch;
	}

	/**
	 * Presents the pieces of text contained in some underlying ‘file’ as a sequence
	 * of {@link Chunk}s.
	 *
	 * Rather than presenting a list of all pieces, the `Chunker` provides methods
	 * to walk through the file piece by piece. This permits implementations to read
	 * and convert the file to `Chunk`s lazily.
	 *
	 * For those familiar with the DOM APIs, it is similar to a NodeIterator (but
	 * unlike NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk).
	 *
	 * @typeParam TChunk - (sub)type of `Chunk` being used.
	 *
	 * @public
	 */
	export interface Chunker<TChunk extends Chunk<any>> {
	  /**
	   * The chunk currently being pointed at.
	   *
	   * Initially, this should normally be the first chunk in the file.
	   */
	  readonly currentChunk: TChunk;

	  /**
	   * Point {@link Chunker.currentChunk} at the chunk following it, and return
	   * that chunk. If there are no chunks following it, keep `currentChunk`
	   * unchanged and return null.
	   *
	   * @returns The new current chunk, or `null` if no chunk follows.
	   */
	  nextChunk(): TChunk | null;

	  /**
	   * Point {@link Chunker.currentChunk} at the chunk preceding it, and return
	   * that chunk. If there are no chunks preceding it, keep `currentChunk`
	   * unchanged and return null.
	   *
	   * @returns The new current chunk, or `null` if no chunk precedes.
	   */
	  previousChunk(): TChunk | null;

	  /**
	   * Test if a given `chunk` is before the {@link Chunker.currentChunk|current
	   * chunk}.
	   *
	   * Returns true if `chunk` is before `this.currentChunk`, false otherwise
	   * (i.e. if `chunk` follows it or is the current chunk).
	   *
	   * The given `chunk` need not necessarily be obtained from the same `Chunker`,
	   * but the chunkers would need to represent the same file. Otherwise behaviour
	   * is unspecified (an implementation might throw or just return `false`).
	   *
	   * @param chunk - A chunk, typically obtained from the same `Chunker`.
	   * @returns Whether `chunk` is positioned before the current chunk.
	   */
	  precedesCurrentChunk(chunk: TChunk): boolean;
	}