blob: 645e1b76f7945b390bd453f49b03efc297e9e5a8 [file] [log] [blame]
/*
Package purell offers URL normalization as described on the wikipedia page:
http://en.wikipedia.org/wiki/URL_normalization
*/
package purell
import (
"bytes"
"fmt"
"net/url"
"regexp"
"sort"
"strconv"
"strings"
"github.com/PuerkitoBio/urlesc"
"golang.org/x/net/idna"
"golang.org/x/text/unicode/norm"
"golang.org/x/text/width"
)
// A set of normalization flags determines how a URL will
// be normalized.
type NormalizationFlags uint
const (
// Safe normalizations
FlagLowercaseScheme NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
FlagLowercaseHost // http://HOST -> http://host
FlagUppercaseEscapes // http://host/t%ef -> http://host/t%EF
FlagDecodeUnnecessaryEscapes // http://host/t%41 -> http://host/tA
FlagEncodeNecessaryEscapes // http://host/!"#$ -> http://host/%21%22#$
FlagRemoveDefaultPort // http://host:80 -> http://host
FlagRemoveEmptyQuerySeparator // http://host/path? -> http://host/path
// Usually safe normalizations
FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
FlagAddTrailingSlash // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
FlagRemoveDotSegments // http://host/path/./a/b/../c -> http://host/path/a/c
// Unsafe normalizations
FlagRemoveDirectoryIndex // http://host/path/index.html -> http://host/path/
FlagRemoveFragment // http://host/path#fragment -> http://host/path
FlagForceHTTP // https://host -> http://host
FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
FlagRemoveWWW // http://www.host/ -> http://host/
FlagAddWWW // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
FlagSortQuery // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3
// Normalizations not in the wikipedia article, required to cover tests cases
// submitted by jehiah
FlagDecodeDWORDHost // http://1113982867 -> http://66.102.7.147
FlagDecodeOctalHost // http://0102.0146.07.0223 -> http://66.102.7.147
FlagDecodeHexHost // http://0x42660793 -> http://66.102.7.147
FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
FlagRemoveEmptyPortSeparator // http://host:/path -> http://host/path
// Convenience set of safe normalizations
FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator
// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".
// Convenience set of usually safe normalizations (includes FlagsSafe)
FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments
// Convenience set of unsafe normalizations (includes FlagsUsuallySafe)
FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery
// Convenience set of all available flags
FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)
const (
defaultHttpPort = ":80"
defaultHttpsPort = ":443"
)
// Regular expressions used by the normalizations
var rxPort = regexp.MustCompile(`(:\d+)/?$`)
var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)
var rxDupSlashes = regexp.MustCompile(`/{2,}`)
var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)
var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)
var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)
var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)
var rxEmptyPort = regexp.MustCompile(`:+$`)
// Map of flags to implementation function.
// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
// Since maps have undefined traversing order, make a slice of ordered keys
var flagsOrder = []NormalizationFlags{
FlagLowercaseScheme,
FlagLowercaseHost,
FlagRemoveDefaultPort,
FlagRemoveDirectoryIndex,
FlagRemoveDotSegments,
FlagRemoveFragment,
FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
FlagRemoveDuplicateSlashes,
FlagRemoveWWW,
FlagAddWWW,
FlagSortQuery,
FlagDecodeDWORDHost,
FlagDecodeOctalHost,
FlagDecodeHexHost,
FlagRemoveUnnecessaryHostDots,
FlagRemoveEmptyPortSeparator,
FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
FlagAddTrailingSlash,
}
// ... and then the map, where order is unimportant
var flags = map[NormalizationFlags]func(*url.URL){
FlagLowercaseScheme: lowercaseScheme,
FlagLowercaseHost: lowercaseHost,
FlagRemoveDefaultPort: removeDefaultPort,
FlagRemoveDirectoryIndex: removeDirectoryIndex,
FlagRemoveDotSegments: removeDotSegments,
FlagRemoveFragment: removeFragment,
FlagForceHTTP: forceHTTP,
FlagRemoveDuplicateSlashes: removeDuplicateSlashes,
FlagRemoveWWW: removeWWW,
FlagAddWWW: addWWW,
FlagSortQuery: sortQuery,
FlagDecodeDWORDHost: decodeDWORDHost,
FlagDecodeOctalHost: decodeOctalHost,
FlagDecodeHexHost: decodeHexHost,
FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
FlagRemoveEmptyPortSeparator: removeEmptyPortSeparator,
FlagRemoveTrailingSlash: removeTrailingSlash,
FlagAddTrailingSlash: addTrailingSlash,
}
// MustNormalizeURLString returns the normalized string, and panics if an error occurs.
// It takes an URL string as input, as well as the normalization flags.
func MustNormalizeURLString(u string, f NormalizationFlags) string {
result, e := NormalizeURLString(u, f)
if e != nil {
panic(e)
}
return result
}
// NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
// It takes an URL string as input, as well as the normalization flags.
func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
parsed, err := url.Parse(u)
if err != nil {
return "", err
}
if f&FlagLowercaseHost == FlagLowercaseHost {
parsed.Host = strings.ToLower(parsed.Host)
}
// The idna package doesn't fully conform to RFC 5895
// (https://tools.ietf.org/html/rfc5895), so we do it here.
// Taken from Go 1.8 cycle source, courtesy of bradfitz.
// TODO: Remove when (if?) idna package conforms to RFC 5895.
parsed.Host = width.Fold.String(parsed.Host)
parsed.Host = norm.NFC.String(parsed.Host)
if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil {
return "", err
}
return NormalizeURL(parsed, f), nil
}
// NormalizeURL returns the normalized string.
// It takes a parsed URL object as input, as well as the normalization flags.
func NormalizeURL(u *url.URL, f NormalizationFlags) string {
for _, k := range flagsOrder {
if f&k == k {
flags[k](u)
}
}
return urlesc.Escape(u)
}
func lowercaseScheme(u *url.URL) {
if len(u.Scheme) > 0 {
u.Scheme = strings.ToLower(u.Scheme)
}
}
func lowercaseHost(u *url.URL) {
if len(u.Host) > 0 {
u.Host = strings.ToLower(u.Host)
}
}
func removeDefaultPort(u *url.URL) {
if len(u.Host) > 0 {
scheme := strings.ToLower(u.Scheme)
u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
return ""
}
return val
})
}
}
func removeTrailingSlash(u *url.URL) {
if l := len(u.Path); l > 0 {
if strings.HasSuffix(u.Path, "/") {
u.Path = u.Path[:l-1]
}
} else if l = len(u.Host); l > 0 {
if strings.HasSuffix(u.Host, "/") {
u.Host = u.Host[:l-1]
}
}
}
func addTrailingSlash(u *url.URL) {
if l := len(u.Path); l > 0 {
if !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
} else if l = len(u.Host); l > 0 {
if !strings.HasSuffix(u.Host, "/") {
u.Host += "/"
}
}
}
func removeDotSegments(u *url.URL) {
if len(u.Path) > 0 {
var dotFree []string
var lastIsDot bool
sections := strings.Split(u.Path, "/")
for _, s := range sections {
if s == ".." {
if len(dotFree) > 0 {
dotFree = dotFree[:len(dotFree)-1]
}
} else if s != "." {
dotFree = append(dotFree, s)
}
lastIsDot = (s == "." || s == "..")
}
// Special case if host does not end with / and new path does not begin with /
u.Path = strings.Join(dotFree, "/")
if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") {
u.Path = "/" + u.Path
}
// Special case if the last segment was a dot, make sure the path ends with a slash
if lastIsDot && !strings.HasSuffix(u.Path, "/") {
u.Path += "/"
}
}
}
func removeDirectoryIndex(u *url.URL) {
if len(u.Path) > 0 {
u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
}
}
func removeFragment(u *url.URL) {
u.Fragment = ""
}
func forceHTTP(u *url.URL) {
if strings.ToLower(u.Scheme) == "https" {
u.Scheme = "http"
}
}
func removeDuplicateSlashes(u *url.URL) {
if len(u.Path) > 0 {
u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
}
}
func removeWWW(u *url.URL) {
if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") {
u.Host = u.Host[4:]
}
}
func addWWW(u *url.URL) {
if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") {
u.Host = "www." + u.Host
}
}
func sortQuery(u *url.URL) {
q := u.Query()
if len(q) > 0 {
arKeys := make([]string, len(q))
i := 0
for k, _ := range q {
arKeys[i] = k
i++
}
sort.Strings(arKeys)
buf := new(bytes.Buffer)
for _, k := range arKeys {
sort.Strings(q[k])
for _, v := range q[k] {
if buf.Len() > 0 {
buf.WriteRune('&')
}
buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v)))
}
}
// Rebuild the raw query string
u.RawQuery = buf.String()
}
}
func decodeDWORDHost(u *url.URL) {
if len(u.Host) > 0 {
if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
var parts [4]int64
dword, _ := strconv.ParseInt(matches[1], 10, 0)
for i, shift := range []uint{24, 16, 8, 0} {
parts[i] = dword >> shift & 0xFF
}
u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
}
}
}
func decodeOctalHost(u *url.URL) {
if len(u.Host) > 0 {
if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
var parts [4]int64
for i := 1; i <= 4; i++ {
parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
}
u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
}
}
}
func decodeHexHost(u *url.URL) {
if len(u.Host) > 0 {
if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
// Conversion is safe because of regex validation
parsed, _ := strconv.ParseInt(matches[1], 16, 0)
// Set host as DWORD (base 10) encoded host
u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
// The rest is the same as decoding a DWORD host
decodeDWORDHost(u)
}
}
}
func removeUnncessaryHostDots(u *url.URL) {
if len(u.Host) > 0 {
if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
// Trim the leading and trailing dots
u.Host = strings.Trim(matches[1], ".")
if len(matches) > 2 {
u.Host += matches[2]
}
}
}
}
func removeEmptyPortSeparator(u *url.URL) {
if len(u.Host) > 0 {
u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")
}
}