| /** |
| * SPDX-FileCopyrightText: 2016-2020 The Apache Software Foundation |
| * SPDX-License-Identifier: Apache-2.0 |
| * @license |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| import type { Chunk } from './chunker'; |
| import type { Seeker } from './seeker'; |
| |
| export class CodePointSeeker<TChunk extends Chunk<string>> |
| implements Seeker<TChunk, string[]> { |
| position = 0; |
| |
| constructor(public readonly raw: Seeker<TChunk>) {} |
| |
| seekBy(length: number): void { |
| this.seekTo(this.position + length); |
| } |
| |
| seekTo(target: number): void { |
| this._readOrSeekTo(false, target); |
| } |
| |
| read(length: number, roundUp?: boolean): string[] { |
| return this.readTo(this.position + length, roundUp); |
| } |
| |
| readTo(target: number, roundUp?: boolean): string[] { |
| return this._readOrSeekTo(true, target, roundUp); |
| } |
| |
| get currentChunk(): TChunk { |
| return this.raw.currentChunk; |
| } |
| |
| get offsetInChunk(): number { |
| return this.raw.offsetInChunk; |
| } |
| |
| seekToChunk(target: TChunk, offset = 0): void { |
| this._readOrSeekToChunk(false, target, offset); |
| } |
| |
| readToChunk(target: TChunk, offset = 0): string[] { |
| return this._readOrSeekToChunk(true, target, offset); |
| } |
| |
| private _readOrSeekToChunk( |
| read: true, |
| target: TChunk, |
| offset?: number, |
| ): string[]; |
| private _readOrSeekToChunk( |
| read: false, |
| target: TChunk, |
| offset?: number, |
| ): void; |
| private _readOrSeekToChunk(read: boolean, target: TChunk, offset = 0) { |
| const oldRawPosition = this.raw.position; |
| |
| let s = this.raw.readToChunk(target, offset); |
| |
| const movedForward = this.raw.position >= oldRawPosition; |
| |
| if (movedForward && endsWithinCharacter(s)) { |
| this.raw.seekBy(-1); |
| s = s.slice(0, -1); |
| } else if (!movedForward && startsWithinCharacter(s)) { |
| this.raw.seekBy(1); |
| s = s.slice(1); |
| } |
| |
| const result = [...s]; |
| |
| this.position = movedForward |
| ? this.position + result.length |
| : this.position - result.length; |
| |
| if (read) return result; |
| } |
| |
| private _readOrSeekTo( |
| read: true, |
| target: number, |
| roundUp?: boolean, |
| ): string[]; |
| private _readOrSeekTo(read: false, target: number, roundUp?: boolean): void; |
| private _readOrSeekTo( |
| read: boolean, |
| target: number, |
| roundUp = false, |
| ): string[] | void { |
| let result: string[] = []; |
| |
| if (this.position < target) { |
| let unpairedSurrogate = ''; |
| let characters: string[] = []; |
| while (this.position < target) { |
| let s = unpairedSurrogate + this.raw.read(1, true); |
| if (endsWithinCharacter(s)) { |
| unpairedSurrogate = s.slice(-1); // consider this half-character part of the next string. |
| s = s.slice(0, -1); |
| } else { |
| unpairedSurrogate = ''; |
| } |
| characters = [...s]; |
| this.position += characters.length; |
| if (read) result = result.concat(characters); |
| } |
| if (unpairedSurrogate) this.raw.seekBy(-1); // align with the last complete character. |
| if (!roundUp && this.position > target) { |
| const overshootInCodePoints = this.position - target; |
| const overshootInCodeUnits = characters |
| .slice(-overshootInCodePoints) |
| .join('').length; |
| this.position -= overshootInCodePoints; |
| this.raw.seekBy(-overshootInCodeUnits); |
| } |
| } else { |
| // Nearly equal to the if-block, but moving backward in the text. |
| let unpairedSurrogate = ''; |
| let characters: string[] = []; |
| while (this.position > target) { |
| let s = this.raw.read(-1, true) + unpairedSurrogate; |
| if (startsWithinCharacter(s)) { |
| unpairedSurrogate = s[0]; |
| s = s.slice(1); |
| } else { |
| unpairedSurrogate = ''; |
| } |
| characters = [...s]; |
| this.position -= characters.length; |
| if (read) result = characters.concat(result); |
| } |
| if (unpairedSurrogate) this.raw.seekBy(1); |
| if (!roundUp && this.position < target) { |
| const overshootInCodePoints = target - this.position; |
| const overshootInCodeUnits = characters |
| .slice(0, overshootInCodePoints) |
| .join('').length; |
| this.position += overshootInCodePoints; |
| this.raw.seekBy(overshootInCodeUnits); |
| } |
| } |
| |
| if (read) return result; |
| } |
| } |
| |
| function endsWithinCharacter(s: string) { |
| const codeUnit = s.charCodeAt(s.length - 1); |
| return 0xd800 <= codeUnit && codeUnit <= 0xdbff; |
| } |
| |
| function startsWithinCharacter(s: string) { |
| const codeUnit = s.charCodeAt(0); |
| return 0xdc00 <= codeUnit && codeUnit <= 0xdfff; |
| } |