blob: c7a75ecb2c1df5713f9853ab44058c9d008644d2 [file] [log] [blame]
/**
* SPDX-FileCopyrightText: 2016-2020 The Apache Software Foundation
* SPDX-License-Identifier: Apache-2.0
* @license
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import type { TextQuoteSelector } from '../types';
import type { Chunk, Chunker, ChunkRange } from './chunker';
export function textQuoteSelectorMatcher(
selector: TextQuoteSelector,
): <TChunk extends Chunk<any>>(
scope: Chunker<TChunk>,
) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
return async function* matchAll<TChunk extends Chunk<string>>(
textChunks: Chunker<TChunk>,
) {
const exact = selector.exact;
const prefix = selector.prefix || '';
const suffix = selector.suffix || '';
const searchPattern = prefix + exact + suffix;
// The code below runs a loop with three steps:
// 1. Continue checking any partial matches from the previous chunk(s).
// 2. Try find the whole pattern in the chunk (possibly multiple times).
// 3. Check if this chunk ends with a partial match (or even multiple partial matches).
interface PartialMatch {
startChunk?: TChunk;
startIndex?: number;
endChunk?: TChunk;
endIndex?: number;
charactersMatched: number;
}
let partialMatches: PartialMatch[] = [];
let isFirstChunk = true;
do {
const chunk = textChunks.currentChunk;
const chunkValue = chunk.data;
// 1. Continue checking any partial matches from the previous chunk(s).
const remainingPartialMatches: typeof partialMatches = [];
for (const partialMatch of partialMatches) {
const charactersMatched = partialMatch.charactersMatched;
// If the current chunk contains the start and/or end of the match, record these.
if (partialMatch.endChunk === undefined) {
const charactersUntilMatchEnd =
prefix.length + exact.length - charactersMatched;
if (charactersUntilMatchEnd <= chunkValue.length) {
partialMatch.endChunk = chunk;
partialMatch.endIndex = charactersUntilMatchEnd;
}
}
if (partialMatch.startChunk === undefined) {
const charactersUntilMatchStart = prefix.length - charactersMatched;
if (
charactersUntilMatchStart < chunkValue.length ||
partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
) {
partialMatch.startChunk = chunk;
partialMatch.startIndex = charactersUntilMatchStart;
}
}
const charactersUntilSuffixEnd =
searchPattern.length - charactersMatched;
if (charactersUntilSuffixEnd <= chunkValue.length) {
if (
chunkValue.startsWith(searchPattern.substring(charactersMatched))
) {
yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
}
} else if (
chunkValue ===
searchPattern.substring(
charactersMatched,
charactersMatched + chunkValue.length,
)
) {
// The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
partialMatch.charactersMatched += chunkValue.length;
remainingPartialMatches.push(partialMatch);
}
}
partialMatches = remainingPartialMatches;
// 2. Try find the whole pattern in the chunk (possibly multiple times).
if (searchPattern.length <= chunkValue.length) {
let fromIndex = 0;
while (fromIndex <= chunkValue.length) {
const patternStartIndex = chunkValue.indexOf(
searchPattern,
fromIndex,
);
if (patternStartIndex === -1) break;
fromIndex = patternStartIndex + 1;
// Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk.
if (
patternStartIndex === 0 &&
searchPattern.length === 0 &&
!isFirstChunk
)
continue;
yield {
startChunk: chunk,
startIndex: patternStartIndex + prefix.length,
endChunk: chunk,
endIndex: patternStartIndex + prefix.length + exact.length,
};
}
}
// 3. Check if this chunk ends with a partial match (or even multiple partial matches).
let newPartialMatches: number[] = [];
const searchStartPoint = Math.max(
chunkValue.length - searchPattern.length + 1,
0,
);
for (let i = searchStartPoint; i < chunkValue.length; i++) {
const character = chunkValue[i];
newPartialMatches = newPartialMatches.filter(
(partialMatchStartIndex) =>
character === searchPattern[i - partialMatchStartIndex],
);
if (character === searchPattern[0]) newPartialMatches.push(i);
}
for (const partialMatchStartIndex of newPartialMatches) {
const charactersMatched = chunkValue.length - partialMatchStartIndex;
const partialMatch: PartialMatch = {
charactersMatched,
};
if (charactersMatched >= prefix.length + exact.length) {
partialMatch.endChunk = chunk;
partialMatch.endIndex =
partialMatchStartIndex + prefix.length + exact.length;
}
if (
charactersMatched > prefix.length ||
partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
) {
partialMatch.startChunk = chunk;
partialMatch.startIndex = partialMatchStartIndex + prefix.length;
}
partialMatches.push(partialMatch);
}
isFirstChunk = false;
} while (textChunks.nextChunk() !== null);
};
}