| /** |
| * SPDX-FileCopyrightText: 2016-2020 The Apache Software Foundation |
| * SPDX-License-Identifier: Apache-2.0 |
| * @license |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| import type { TextQuoteSelector } from '../types'; |
| import type { Chunk, Chunker, ChunkRange } from './chunker'; |
| |
| /** |
| * Find occurrences in a text matching the given {@link TextQuoteSelector}. |
| * |
| * This performs an exact search the selector’s quote (including prefix and |
| * suffix) within the given text. |
| * |
| * Note the match is based on strict character-by-character equivalence, i.e. |
| * it is sensitive to whitespace, capitalisation, etc. |
| * |
| * This is an abstract implementation of the function’s logic, which expects a |
| * generic {@link Chunker} to represent the text, and returns an (async) |
| * generator of {@link ChunkRange}s to represent the matches. |
| * |
| * See {@link dom.createTextQuoteSelectorMatcher} for a |
| * wrapper around this implementation which applies it to the text of an HTML |
| * DOM. |
| * |
| * The function is curried, taking first the selector and then the text. |
| * |
| * As there may be multiple matches for a given selector (when its prefix and |
| * suffix attributes are not sufficient to disambiguate it), the matcher will |
| * return an (async) generator that produces each match in the order they are |
| * found in the text. |
| * |
| * *XXX Modifying the Chunks while the search is still running can mess up and |
| * result in an error or an infinite loop. See [issue #112](https://github.com/apache/incubator-annotator/issues/112).* |
| * |
| * @example |
| * ``` |
| * const selector = { type: 'TextQuoteSelector', exact: 'banana' }; |
| * const matches = textQuoteSelectorMatcher(selector)(textChunks); |
| * for await (match of matches) console.log(match); |
| * // ⇒ { startChunk: { … }, startIndex: 187, endChunk: { … }, endIndex: 193 } |
| * // ⇒ { startChunk: { … }, startIndex: 631, endChunk: { … }, endIndex: 637 } |
| * ``` |
| * |
| * @param selector - The {@link TextQuoteSelector} to be anchored |
| * @returns a {@link Matcher} function that applies `selector` to a given text |
| * |
| * @public |
| */ |
| export function textQuoteSelectorMatcher( |
| selector: TextQuoteSelector, |
| ): <TChunk extends Chunk<any>>( |
| scope: Chunker<TChunk>, |
| ) => AsyncGenerator<ChunkRange<TChunk>, void, void> { |
| return async function* matchAll<TChunk extends Chunk<string>>( |
| textChunks: Chunker<TChunk>, |
| ) { |
| const exact = selector.exact; |
| const prefix = selector.prefix || ''; |
| const suffix = selector.suffix || ''; |
| const searchPattern = prefix + exact + suffix; |
| |
| // The code below essentially just performs string.indexOf(searchPattern), |
| // but on a string that is chopped up in multiple chunks. It runs a loop |
| // containing three steps: |
| // 1. Continue checking any partial matches from the previous chunk(s). |
| // 2. Try find the whole pattern in the chunk (possibly multiple times). |
| // 3. Check if this chunk ends with a partial match (or even multiple partial matches). |
| |
| interface PartialMatch { |
| startChunk?: TChunk; |
| startIndex?: number; |
| endChunk?: TChunk; |
| endIndex?: number; |
| charactersMatched: number; |
| } |
| let partialMatches: PartialMatch[] = []; |
| |
| let isFirstChunk = true; |
| do { |
| const chunk = textChunks.currentChunk; |
| const chunkValue = chunk.data; |
| |
| // 1. Continue checking any partial matches from the previous chunk(s). |
| const remainingPartialMatches: typeof partialMatches = []; |
| for (const partialMatch of partialMatches) { |
| const charactersMatched = partialMatch.charactersMatched; |
| |
| // If the current chunk contains the start and/or end of the match, record these. |
| if (partialMatch.endChunk === undefined) { |
| const charactersUntilMatchEnd = |
| prefix.length + exact.length - charactersMatched; |
| if (charactersUntilMatchEnd <= chunkValue.length) { |
| partialMatch.endChunk = chunk; |
| partialMatch.endIndex = charactersUntilMatchEnd; |
| } |
| } |
| if (partialMatch.startChunk === undefined) { |
| const charactersUntilMatchStart = prefix.length - charactersMatched; |
| if ( |
| charactersUntilMatchStart < chunkValue.length || |
| partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. |
| ) { |
| partialMatch.startChunk = chunk; |
| partialMatch.startIndex = charactersUntilMatchStart; |
| } |
| } |
| |
| const charactersUntilSuffixEnd = |
| searchPattern.length - charactersMatched; |
| if (charactersUntilSuffixEnd <= chunkValue.length) { |
| if ( |
| chunkValue.startsWith(searchPattern.substring(charactersMatched)) |
| ) { |
| yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now. |
| } |
| } else if ( |
| chunkValue === |
| searchPattern.substring( |
| charactersMatched, |
| charactersMatched + chunkValue.length, |
| ) |
| ) { |
| // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. |
| partialMatch.charactersMatched += chunkValue.length; |
| remainingPartialMatches.push(partialMatch); |
| } |
| } |
| partialMatches = remainingPartialMatches; |
| |
| // 2. Try find the whole pattern in the chunk (possibly multiple times). |
| if (searchPattern.length <= chunkValue.length) { |
| let fromIndex = 0; |
| while (fromIndex <= chunkValue.length) { |
| const patternStartIndex = chunkValue.indexOf( |
| searchPattern, |
| fromIndex, |
| ); |
| if (patternStartIndex === -1) break; |
| fromIndex = patternStartIndex + 1; |
| |
| // Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk. |
| if ( |
| patternStartIndex === 0 && |
| searchPattern.length === 0 && |
| !isFirstChunk |
| ) |
| continue; |
| |
| yield { |
| startChunk: chunk, |
| startIndex: patternStartIndex + prefix.length, |
| endChunk: chunk, |
| endIndex: patternStartIndex + prefix.length + exact.length, |
| }; |
| } |
| } |
| |
| // 3. Check if this chunk ends with a partial match (or even multiple partial matches). |
| let newPartialMatches: number[] = []; |
| const searchStartPoint = Math.max( |
| chunkValue.length - searchPattern.length + 1, |
| 0, |
| ); |
| for (let i = searchStartPoint; i < chunkValue.length; i++) { |
| const character = chunkValue[i]; |
| newPartialMatches = newPartialMatches.filter( |
| (partialMatchStartIndex) => |
| character === searchPattern[i - partialMatchStartIndex], |
| ); |
| if (character === searchPattern[0]) newPartialMatches.push(i); |
| } |
| for (const partialMatchStartIndex of newPartialMatches) { |
| const charactersMatched = chunkValue.length - partialMatchStartIndex; |
| const partialMatch: PartialMatch = { |
| charactersMatched, |
| }; |
| if (charactersMatched >= prefix.length + exact.length) { |
| partialMatch.endChunk = chunk; |
| partialMatch.endIndex = |
| partialMatchStartIndex + prefix.length + exact.length; |
| } |
| if ( |
| charactersMatched > prefix.length || |
| partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. |
| ) { |
| partialMatch.startChunk = chunk; |
| partialMatch.startIndex = partialMatchStartIndex + prefix.length; |
| } |
| partialMatches.push(partialMatch); |
| } |
| |
| isFirstChunk = false; |
| } while (textChunks.nextChunk() !== null); |
| }; |
| } |