/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package htmltext

import (
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"unicode/utf8"

	"github.com/Machiel/slugify"
	"github.com/apache/incubator-answer/pkg/checker"
	"github.com/apache/incubator-answer/pkg/converter"
	strip "github.com/grokify/html-strip-tags-go"
	"github.com/mozillazg/go-pinyin"
)

// ClearText strips HTML tags from the given fragment and returns the plain text.
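// Code blocks are collapsed to a "{code...}" placeholder and links are reduced
// to their inner text in brackets, so a fragment such as
// `<p>Hello <a href="/x">world</a></p>` should come back roughly as "Hello [world]".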
func ClearText(html string) (text string) {
	if len(html) == 0 {
		text = html
		return
	}

	var (
		re *regexp.Regexp
		// replace each <pre>...</pre> code block with a short placeholder
		codeReg  = `(?ism)<pre>.*?</pre>`
		codeRepl = "{code...}"
		// keep only the inner text of each link, wrapped in brackets
		linkReg   = `(?ism)<a.*?[^<]>(.*?)</a>`
		linkRepl  = " [$1] "
		spaceReg  = ` +`
		spaceRepl = " "
	)
	re = regexp.MustCompile(codeReg)
	html = re.ReplaceAllString(html, codeRepl)

	re = regexp.MustCompile(linkReg)
	html = re.ReplaceAllString(html, linkRepl)

	text = strings.NewReplacer(
		"\n", " ",
		"\r", " ",
		"\t", " ",
	).Replace(strip.StripTags(html))

	// collapse consecutive spaces into one and trim the result
	re = regexp.MustCompile(spaceReg)
	text = strings.TrimSpace(re.ReplaceAllString(text, spaceRepl))
	return
}

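// UrlTitle converts a post title into a URL-safe slug: Chinese characters are
// transliterated to pinyin, emoji are removed, and the rest is slugified,
// percent-escaped, and truncated; an empty result falls back to "topic".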
func UrlTitle(title string) (text string) {
	title = convertChinese(title)
	title = clearEmoji(title)
	title = slugify.Slugify(title)
	title = url.QueryEscape(title)
	title = cutLongTitle(title)
	if len(title) == 0 {
		title = "topic"
	}
	return title
}

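// clearEmoji removes emoji (and any other character that needs four bytes in
// UTF-8) from the string.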
func clearEmoji(s string) string {
	var b strings.Builder
	for _, r := range s {
		// runes encoded as four bytes in UTF-8 lie outside the Basic
		// Multilingual Plane; emoji fall in that range, so skip them
		if utf8.RuneLen(r) != 4 {
			b.WriteRune(r)
		}
	}
	return b.String()
}

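// convertChinese transliterates a title that contains Chinese characters into
// hyphen-joined pinyin; titles without Chinese are returned unchanged.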
func convertChinese(content string) string {
	has := checker.IsChinese(content)
	if !has {
		return content
	}
	return strings.Join(pinyin.LazyConvert(content, nil), "-")
}

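// cutLongTitle truncates the title to at most 150 bytes.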
func cutLongTitle(title string) string {
	if len(title) > 150 {
		return title[0:150]
	}
	return title
}

// FetchExcerpt returns an excerpt of at most limit runes from the HTML string
func FetchExcerpt(html, trimMarker string, limit int) (text string) {
	return FetchRangedExcerpt(html, trimMarker, 0, limit)
}

// findFirstMatchedWord returns the earliest matching word and its byte index
// in text, or ("", 0) when none of the words match
func findFirstMatchedWord(text string, words []string) (string, int) {
	if len(text) == 0 || len(words) == 0 {
		return "", 0
	}

	words = converter.UniqueArray(words)
	firstWord := ""
	firstIndex := len(text)

	for _, word := range words {
		if idx := strings.Index(text, word); idx != -1 && idx < firstIndex {
			firstIndex = idx
			firstWord = word
		}
	}

	if firstIndex != len(text) {
		return firstWord, firstIndex
	}

	return "", 0
}

// getRuneRange clamps offset and limit to the bounds of runeText and returns
// the resulting begin and end indexes
func getRuneRange(runeText []rune, offset, limit int) (begin, end int) {
	runeLen := len(runeText)

	limit = min(runeLen, max(0, limit))
	begin = min(runeLen, max(0, offset))
	end = min(runeLen, begin+limit)

	return
}

// FetchRangedExcerpt returns a ranged excerpt from the HTML string.
// Note: offset is a rune index, not a byte index.
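// trimMarker is prepended or appended whenever the excerpt is cut at that end;
// for example, FetchRangedExcerpt("<p>hello world</p>", "...", 2, 5) should
// yield roughly "...llo w...".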
func FetchRangedExcerpt(html, trimMarker string, offset int, limit int) (text string) {
	if len(html) == 0 {
		text = html
		return
	}

	runeText := []rune(ClearText(html))
	begin, end := getRuneRange(runeText, offset, limit)
	text = string(runeText[begin:end])

	if begin > 0 {
		text = trimMarker + text
	}
	if end < len(runeText) {
		text = text + trimMarker
	}

	return
}

// FetchMatchedExcerpt returns an excerpt around the first word from words that
// appears in the text, keeping about trimLength runes of context on each side
// and marking cut ends with trimMarker.
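// For example, FetchMatchedExcerpt("<p>the quick brown fox</p>", []string{"brown"}, "...", 3)
// should yield roughly "...ck brown fo...".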
func FetchMatchedExcerpt(html string, words []string, trimMarker string, trimLength int) string {
	text := ClearText(html)
	matchedWord, matchedIndex := findFirstMatchedWord(text, words)
	runeIndex := utf8.RuneCountInString(text[0:matchedIndex])

	trimLength = max(0, trimLength)
	runeOffset := runeIndex - trimLength
	runeLimit := trimLength + trimLength + utf8.RuneCountInString(matchedWord)

	textRuneCount := utf8.RuneCountInString(text)
	if runeOffset+runeLimit > textRuneCount {
		// the window would run past the end of the text, so shift it left and
		// show the spare length as extra characters before the matched word
		runeOffset = textRuneCount - runeLimit
	}

	return FetchRangedExcerpt(html, trimMarker, runeOffset, runeLimit)
}

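// GetPicByUrl downloads the resource at the given URL and returns its body as
// a string; an empty string is returned on any failure.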
func GetPicByUrl(picURL string) string {
	res, err := http.Get(picURL)
	if err != nil {
		return ""
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return ""
	}
	pix, err := io.ReadAll(res.Body)
	if err != nil {
		return ""
	}
	return string(pix)
}