| /* |
| Copyright 2015 The Kubernetes Authors. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| // This tool extracts the links from types.go and .md files, visits the link and |
| // checks the status code of the response. |
| // Usage: |
| // $ linkcheck --root-dir=${ROOT} |
| |
| package main |
| |
| import ( |
| "fmt" |
| "io/ioutil" |
| "net/http" |
| "os" |
| "path/filepath" |
| "regexp" |
| "strconv" |
| "strings" |
| "time" |
| |
| "github.com/mvdan/xurls" |
| flag "github.com/spf13/pflag" |
| ) |
| |
| var ( |
| rootDir = flag.String("root-dir", "", "Root directory containing documents to be processed.") |
| fileSuffix = flag.StringSlice("file-suffix", []string{"types.go", ".md"}, "suffix of files to be checked") |
| // URLs matching the patterns in the regWhiteList won't be checked. Patterns |
| // of dummy URLs should be added to the list to avoid false alerts. Also, |
| // patterns of URLs that we don't care about can be added here to improve |
| // efficiency. |
| regWhiteList = []*regexp.Regexp{ |
| regexp.MustCompile(`https://kubernetes-site\.appspot\.com`), |
| // skip url that doesn't start with an English alphabet, e.g., URLs with IP addresses. |
| regexp.MustCompile(`https?://[^A-Za-z].*`), |
| regexp.MustCompile(`https?://localhost.*`), |
| } |
| // URLs listed in the fullURLWhiteList won't be checked. This separated from |
| // the RegWhiteList to improve efficiency. This list includes dummy URLs that |
| // are hard to be generalized by a regex, and URLs that will cause false alerts. |
| fullURLWhiteList = map[string]struct{}{ |
| "http://github.com/some/repo.git": {}, |
| // This URL returns 404 when visited by this tool, but it works fine if visited by a browser. |
| "http://stackoverflow.com/questions/ask?tags=kubernetes": {}, |
| "https://github.com/$YOUR_GITHUB_USERNAME/kubernetes.git": {}, |
| "https://github.com/$YOUR_GITHUB_USERNAME/kubernetes": {}, |
| "http://storage.googleapis.com/kubernetes-release/release/v${K8S_VERSION}/bin/darwin/amd64/kubectl": {}, |
| // It seems this server expects certain User-Agent value, it works fine with Chrome, but returns 404 if we issue a plain cURL to it. |
| "http://supervisord.org/": {}, |
| "http://kubernetes.io/vX.Y/docs": {}, |
| "http://kubernetes.io/vX.Y/docs/": {}, |
| "http://kubernetes.io/vX.Y/": {}, |
| } |
| |
| visitedURLs = map[string]struct{}{} |
| htmlpreviewReg = regexp.MustCompile(`https://htmlpreview\.github\.io/\?`) |
| httpOrhttpsReg = regexp.MustCompile(`https?.*`) |
| ) |
| |
| func newWalkFunc(invalidLink *bool, client *http.Client) filepath.WalkFunc { |
| return func(filePath string, info os.FileInfo, err error) error { |
| hasSuffix := false |
| for _, suffix := range *fileSuffix { |
| hasSuffix = hasSuffix || strings.HasSuffix(info.Name(), suffix) |
| } |
| if !hasSuffix { |
| return nil |
| } |
| |
| fileBytes, err := ioutil.ReadFile(filePath) |
| if err != nil { |
| return err |
| } |
| foundInvalid := false |
| allURLs := xurls.Strict.FindAll(fileBytes, -1) |
| fmt.Fprintf(os.Stdout, "\nChecking file %s\n", filePath) |
| URL: |
| for _, URL := range allURLs { |
| // Don't check non http/https URL |
| if !httpOrhttpsReg.Match(URL) { |
| continue |
| } |
| for _, whiteURL := range regWhiteList { |
| if whiteURL.Match(URL) { |
| continue URL |
| } |
| } |
| if _, found := fullURLWhiteList[string(URL)]; found { |
| continue |
| } |
| // remove the htmlpreview Prefix |
| processedURL := htmlpreviewReg.ReplaceAll(URL, []byte{}) |
| |
| // check if we have visited the URL. |
| if _, found := visitedURLs[string(processedURL)]; found { |
| continue |
| } |
| visitedURLs[string(processedURL)] = struct{}{} |
| |
| retry := 0 |
| const maxRetry int = 3 |
| backoff := 100 |
| for retry < maxRetry { |
| fmt.Fprintf(os.Stdout, "Visiting %s\n", string(processedURL)) |
| // Use verb HEAD to increase efficiency. However, some servers |
| // do not handle HEAD well, so we need to try a GET to avoid |
| // false alert. |
| resp, err := client.Head(string(processedURL)) |
| // URLs with mock host or mock port will cause error. If we report |
| // the error here, people need to add the mock URL to the white |
| // list every time they add a mock URL, which will be a maintenance |
| // nightmare. Hence, we decide to only report 404 to catch the |
| // cases where host and port are legit, but path is not, which |
| // is the most common mistake in our docs. |
| if err != nil { |
| break |
| } |
| if resp.StatusCode == http.StatusTooManyRequests { |
| retryAfter := resp.Header.Get("Retry-After") |
| if seconds, err := strconv.Atoi(retryAfter); err != nil { |
| backoff = seconds + 10 |
| } |
| fmt.Fprintf(os.Stderr, "Got %d visiting %s, retry after %d seconds.\n", resp.StatusCode, string(URL), backoff) |
| time.Sleep(time.Duration(backoff) * time.Second) |
| backoff *= 2 |
| retry++ |
| } else if resp.StatusCode == http.StatusNotFound { |
| // We only check for 404 error for now. 401, 403 errors are hard to handle. |
| |
| // We need to try a GET to avoid false alert. |
| resp, err = client.Get(string(processedURL)) |
| if err != nil { |
| break |
| } |
| if resp.StatusCode != http.StatusNotFound { |
| continue URL |
| } |
| |
| foundInvalid = true |
| fmt.Fprintf(os.Stderr, "Failed: in file %s, Got %d visiting %s\n", filePath, resp.StatusCode, string(URL)) |
| break |
| } else { |
| break |
| } |
| } |
| if retry == maxRetry { |
| foundInvalid = true |
| fmt.Fprintf(os.Stderr, "Failed: in file %s, still got 429 visiting %s after %d retries\n", filePath, string(URL), maxRetry) |
| } |
| } |
| if foundInvalid { |
| *invalidLink = true |
| } |
| return nil |
| } |
| } |
| |
| func main() { |
| flag.Parse() |
| |
| if *rootDir == "" { |
| flag.Usage() |
| os.Exit(2) |
| } |
| client := http.Client{ |
| Timeout: time.Duration(5 * time.Second), |
| } |
| invalidLink := false |
| if err := filepath.Walk(*rootDir, newWalkFunc(&invalidLink, &client)); err != nil { |
| fmt.Fprintf(os.Stderr, "Fail: %v.\n", err) |
| os.Exit(2) |
| } |
| if invalidLink { |
| os.Exit(1) |
| } |
| } |