blob: 9e690f9198eb0c31a70fb96717415030272ca4a7 [file] [log] [blame]
/**
* @license
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import seek from 'dom-seek';
import type { TextQuoteSelector } from '@annotator/selector';
import { DomScope } from '../types';
import { ownerDocument, rangeFromScope } from '../scope';
export async function describeTextQuote(
range: Range,
scope: DomScope = ownerDocument(range).documentElement,
): Promise<TextQuoteSelector> {
range = range.cloneRange();
// Take the part of the range that falls within the scope.
const scopeAsRange = rangeFromScope(scope);
if (!scopeAsRange.isPointInRange(range.startContainer, range.startOffset))
range.setStart(scopeAsRange.startContainer, scopeAsRange.startOffset);
if (!scopeAsRange.isPointInRange(range.endContainer, range.endOffset))
range.setEnd(scopeAsRange.endContainer, scopeAsRange.endOffset);
const exact = range.toString();
const result: TextQuoteSelector = { type: 'TextQuoteSelector', exact };
const { prefix, suffix } = calculateContextForDisambiguation(
range,
result,
scope,
);
result.prefix = prefix;
result.suffix = suffix;
return result;
}
function calculateContextForDisambiguation(
range: Range,
selector: TextQuoteSelector,
scope: DomScope,
): { prefix?: string; suffix?: string } {
const exactText = selector.exact;
const scopeText = rangeFromScope(scope).toString();
const targetStartIndex = getRangeTextPosition(range, scope);
const targetEndIndex = targetStartIndex + exactText.length;
// Find all matches of the text in the scope.
const stringMatches: number[] = [];
let fromIndex = 0;
while (fromIndex <= scopeText.length) {
const matchIndex = scopeText.indexOf(exactText, fromIndex);
if (matchIndex === -1) break;
stringMatches.push(matchIndex);
fromIndex = matchIndex + 1;
}
// Count for each undesired match the required prefix and suffix lengths, such that either of them
// would have invalidated the match.
const affixLengthPairs: Array<[number, number]> = [];
for (const matchStartIndex of stringMatches) {
const matchEndIndex = matchStartIndex + exactText.length;
// Skip the found match if it is the actual target.
if (matchStartIndex === targetStartIndex) continue;
// Count how many characters before & after them the false match and target have in common.
const sufficientPrefixLength = charactersNeededToBeUnique(
scopeText.substring(0, targetStartIndex),
scopeText.substring(0, matchStartIndex),
true,
);
const sufficientSuffixLength = charactersNeededToBeUnique(
scopeText.substring(targetEndIndex),
scopeText.substring(matchEndIndex),
false,
);
affixLengthPairs.push([sufficientPrefixLength, sufficientSuffixLength]);
}
// Find the prefix and suffix that would invalidate all mismatches, using the minimal characters
// for prefix and suffix combined.
const [prefixLength, suffixLength] = minimalSolution(affixLengthPairs);
const prefix = scopeText.substring(
targetStartIndex - prefixLength,
targetStartIndex,
);
const suffix = scopeText.substring(
targetEndIndex,
targetEndIndex + suffixLength,
);
return { prefix, suffix };
}
function charactersNeededToBeUnique(
target: string,
impostor: string,
reverse = false,
) {
// Count how many characters the two strings have in common.
let overlap = 0;
const charAt = (s: string, i: number) =>
reverse ? s[s.length - 1 - i] : s[overlap];
while (
overlap < target.length &&
charAt(target, overlap) === charAt(impostor, overlap)
)
overlap++;
if (overlap === target.length) return Infinity;
// (no substring of target can make it distinguishable from its impostor)
else return overlap + 1;
}
function minimalSolution(
requirements: Array<[number, number]>,
): [number, number] {
// Ensure we try solutions with an empty prefix or suffix.
requirements.push([0, 0]);
// Build all the pairs and order them by their sums.
const pairs = requirements.flatMap((l) =>
requirements.map<[number, number]>((r) => [l[0], r[1]]),
);
pairs.sort((a, b) => a[0] + a[1] - (b[0] + b[1]));
// Find the first pair that satisfies every requirement.
for (const pair of pairs) {
const [p0, p1] = pair;
if (requirements.every(([r0, r1]) => r0 <= p0 || r1 <= p1)) {
return pair;
}
}
// Return the largest pairing (unreachable).
return pairs[pairs.length - 1];
}
// Get the index of the first character of range within the text of scope.
function getRangeTextPosition(range: Range, scope: DomScope): number {
const scopeAsRange = rangeFromScope(scope);
const iter = document.createNodeIterator(
scopeAsRange.commonAncestorContainer,
NodeFilter.SHOW_TEXT,
{
acceptNode(node: Text) {
// Only reveal nodes within the range
return scopeAsRange.intersectsNode(node)
? NodeFilter.FILTER_ACCEPT
: NodeFilter.FILTER_REJECT;
},
},
);
const scopeOffset = isTextNode(scopeAsRange.startContainer)
? scopeAsRange.startOffset
: 0;
if (isTextNode(range.startContainer))
return seek(iter, range.startContainer) + range.startOffset - scopeOffset;
else return seek(iter, firstTextNodeInRange(range)) - scopeOffset;
}
function firstTextNodeInRange(range: Range): Text {
// Find the first text node inside the range.
const iter = document.createNodeIterator(
range.commonAncestorContainer,
NodeFilter.SHOW_TEXT,
{
acceptNode(node: Text) {
// Only reveal nodes within the range; and skip any empty text nodes.
return range.intersectsNode(node) && node.length > 0
? NodeFilter.FILTER_ACCEPT
: NodeFilter.FILTER_REJECT;
},
},
);
const node = iter.nextNode() as Text | null;
if (node === null) throw new Error('Range contains no text nodes');
return node;
}
function isTextNode(node: Node): node is Text {
return node.nodeType === Node.TEXT_NODE;
}