| # encoding:utf-8 |
| #-- |
| # Copyright (C) Bob Aman |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| #++ |
| |
| |
| module Addressable |
| module IDNA |
| # This module is loosely based on idn_actionmailer by Mick Staugaard, |
| # the unicode library by Yoshida Masato, and the punycode implementation |
| # by Kazuhiro Nishiyama. Most of the code was copied verbatim, but |
| # some reformatting was done, and some translation from C was done. |
| # |
| # Without their code to work from as a base, we'd all still be relying |
| # on the presence of libidn. Which nobody ever seems to have installed. |
| # |
| # Original sources: |
| # http://github.com/staugaard/idn_actionmailer |
| # http://www.yoshidam.net/Ruby.html#unicode |
| # http://rubyforge.org/frs/?group_id=2550 |
| |
| |
| UNICODE_TABLE = File.expand_path( |
| File.join(File.dirname(__FILE__), '../../..', 'data/unicode.data') |
| ) |
| |
| ACE_PREFIX = "xn--" |
| |
| UTF8_REGEX = /\A(?: |
| [\x09\x0A\x0D\x20-\x7E] # ASCII |
| | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte |
| | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs |
| | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte |
| | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates |
| | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 |
| | [\xF1-\xF3][\x80-\xBF]{3} # planes 4nil5 |
| | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 |
| )*\z/mnx |
| |
| UTF8_REGEX_MULTIBYTE = /(?: |
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte |
| | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs |
| | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte |
| | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates |
| | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 |
| | [\xF1-\xF3][\x80-\xBF]{3} # planes 4nil5 |
| | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 |
| )/mnx |
| |
| # :startdoc: |
| |
| # Converts from a Unicode internationalized domain name to an ASCII |
| # domain name as described in RFC 3490. |
| def self.to_ascii(input) |
| input = input.to_s unless input.is_a?(String) |
| input = input.dup |
| if input.respond_to?(:force_encoding) |
| input.force_encoding(Encoding::ASCII_8BIT) |
| end |
| if input =~ UTF8_REGEX && input =~ UTF8_REGEX_MULTIBYTE |
| parts = unicode_downcase(input).split('.') |
| parts.map! do |part| |
| if part.respond_to?(:force_encoding) |
| part.force_encoding(Encoding::ASCII_8BIT) |
| end |
| if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE |
| ACE_PREFIX + punycode_encode(unicode_normalize_kc(part)) |
| else |
| part |
| end |
| end |
| parts.join('.') |
| else |
| input |
| end |
| end |
| |
| # Converts from an ASCII domain name to a Unicode internationalized |
| # domain name as described in RFC 3490. |
| def self.to_unicode(input) |
| input = input.to_s unless input.is_a?(String) |
| parts = input.split('.') |
| parts.map! do |part| |
| if part =~ /^#{ACE_PREFIX}(.+)/ |
| begin |
| punycode_decode(part[/^#{ACE_PREFIX}(.+)/, 1]) |
| rescue Addressable::IDNA::PunycodeBadInput |
| # toUnicode is explicitly defined as never-fails by the spec |
| part |
| end |
| else |
| part |
| end |
| end |
| output = parts.join('.') |
| if output.respond_to?(:force_encoding) |
| output.force_encoding(Encoding::UTF_8) |
| end |
| output |
| end |
| |
| # Unicode normalization form KC. |
| def self.unicode_normalize_kc(input) |
| input = input.to_s unless input.is_a?(String) |
| unpacked = input.unpack("U*") |
| unpacked = |
| unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked))) |
| return unpacked.pack("U*") |
| end |
| |
| ## |
| # Unicode aware downcase method. |
| # |
| # @api private |
| # @param [String] input |
| # The input string. |
| # @return [String] The downcased result. |
| def self.unicode_downcase(input) |
| input = input.to_s unless input.is_a?(String) |
| unpacked = input.unpack("U*") |
| unpacked.map! { |codepoint| lookup_unicode_lowercase(codepoint) } |
| return unpacked.pack("U*") |
| end |
| (class <<self; private :unicode_downcase; end) |
| |
| def self.unicode_compose(unpacked) |
| unpacked_result = [] |
| length = unpacked.length |
| |
| return unpacked if length == 0 |
| |
| starter = unpacked[0] |
| starter_cc = lookup_unicode_combining_class(starter) |
| starter_cc = 256 if starter_cc != 0 |
| for i in 1...length |
| ch = unpacked[i] |
| cc = lookup_unicode_combining_class(ch) |
| |
| if (starter_cc == 0 && |
| (composite = unicode_compose_pair(starter, ch)) != nil) |
| starter = composite |
| startercc = lookup_unicode_combining_class(composite) |
| else |
| unpacked_result << starter |
| starter = ch |
| startercc = cc |
| end |
| end |
| unpacked_result << starter |
| return unpacked_result |
| end |
| (class <<self; private :unicode_compose; end) |
| |
| def self.unicode_compose_pair(ch_one, ch_two) |
| if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT && |
| ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT |
| # Hangul L + V |
| return HANGUL_SBASE + ( |
| (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE) |
| ) * HANGUL_TCOUNT |
| elsif ch_one >= HANGUL_SBASE && |
| ch_one < HANGUL_SBASE + HANGUL_SCOUNT && |
| (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 && |
| ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT |
| # Hangul LV + T |
| return ch_one + (ch_two - HANGUL_TBASE) |
| end |
| |
| p = [] |
| ucs4_to_utf8 = lambda do |ch| |
| if ch < 128 |
| p << ch |
| elsif ch < 2048 |
| p << (ch >> 6 | 192) |
| p << (ch & 63 | 128) |
| elsif ch < 0x10000 |
| p << (ch >> 12 | 224) |
| p << (ch >> 6 & 63 | 128) |
| p << (ch & 63 | 128) |
| elsif ch < 0x200000 |
| p << (ch >> 18 | 240) |
| p << (ch >> 12 & 63 | 128) |
| p << (ch >> 6 & 63 | 128) |
| p << (ch & 63 | 128) |
| elsif ch < 0x4000000 |
| p << (ch >> 24 | 248) |
| p << (ch >> 18 & 63 | 128) |
| p << (ch >> 12 & 63 | 128) |
| p << (ch >> 6 & 63 | 128) |
| p << (ch & 63 | 128) |
| elsif ch < 0x80000000 |
| p << (ch >> 30 | 252) |
| p << (ch >> 24 & 63 | 128) |
| p << (ch >> 18 & 63 | 128) |
| p << (ch >> 12 & 63 | 128) |
| p << (ch >> 6 & 63 | 128) |
| p << (ch & 63 | 128) |
| end |
| end |
| |
| ucs4_to_utf8.call(ch_one) |
| ucs4_to_utf8.call(ch_two) |
| |
| return lookup_unicode_composition(p) |
| end |
| (class <<self; private :unicode_compose_pair; end) |
| |
| def self.unicode_sort_canonical(unpacked) |
| unpacked = unpacked.dup |
| i = 1 |
| length = unpacked.length |
| |
| return unpacked if length < 2 |
| |
| while i < length |
| last = unpacked[i-1] |
| ch = unpacked[i] |
| last_cc = lookup_unicode_combining_class(last) |
| cc = lookup_unicode_combining_class(ch) |
| if cc != 0 && last_cc != 0 && last_cc > cc |
| unpacked[i] = last |
| unpacked[i-1] = ch |
| i -= 1 if i > 1 |
| else |
| i += 1 |
| end |
| end |
| return unpacked |
| end |
| (class <<self; private :unicode_sort_canonical; end) |
| |
| def self.unicode_decompose(unpacked) |
| unpacked_result = [] |
| for cp in unpacked |
| if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT |
| l, v, t = unicode_decompose_hangul(cp) |
| unpacked_result << l |
| unpacked_result << v if v |
| unpacked_result << t if t |
| else |
| dc = lookup_unicode_compatibility(cp) |
| unless dc |
| unpacked_result << cp |
| else |
| unpacked_result.concat(unicode_decompose(dc.unpack("U*"))) |
| end |
| end |
| end |
| return unpacked_result |
| end |
| (class <<self; private :unicode_decompose; end) |
| |
| def self.unicode_decompose_hangul(codepoint) |
| sindex = codepoint - HANGUL_SBASE; |
| if sindex < 0 || sindex >= HANGUL_SCOUNT |
| l = codepoint |
| v = t = nil |
| return l, v, t |
| end |
| l = HANGUL_LBASE + sindex / HANGUL_NCOUNT |
| v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT |
| t = HANGUL_TBASE + sindex % HANGUL_TCOUNT |
| if t == HANGUL_TBASE |
| t = nil |
| end |
| return l, v, t |
| end |
| (class <<self; private :unicode_decompose_hangul; end) |
| |
| def self.lookup_unicode_combining_class(codepoint) |
| codepoint_data = UNICODE_DATA[codepoint] |
| (codepoint_data ? |
| (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) : |
| 0) |
| end |
| (class <<self; private :lookup_unicode_combining_class; end) |
| |
| def self.lookup_unicode_compatibility(codepoint) |
| codepoint_data = UNICODE_DATA[codepoint] |
| (codepoint_data ? |
| codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil) |
| end |
| (class <<self; private :lookup_unicode_compatibility; end) |
| |
| def self.lookup_unicode_lowercase(codepoint) |
| codepoint_data = UNICODE_DATA[codepoint] |
| (codepoint_data ? |
| (codepoint_data[UNICODE_DATA_LOWERCASE] || codepoint) : |
| codepoint) |
| end |
| (class <<self; private :lookup_unicode_lowercase; end) |
| |
| def self.lookup_unicode_composition(unpacked) |
| return COMPOSITION_TABLE[unpacked] |
| end |
| (class <<self; private :lookup_unicode_composition; end) |
| |
| HANGUL_SBASE = 0xac00 |
| HANGUL_LBASE = 0x1100 |
| HANGUL_LCOUNT = 19 |
| HANGUL_VBASE = 0x1161 |
| HANGUL_VCOUNT = 21 |
| HANGUL_TBASE = 0x11a7 |
| HANGUL_TCOUNT = 28 |
| HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588 |
| HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172 |
| |
| UNICODE_DATA_COMBINING_CLASS = 0 |
| UNICODE_DATA_EXCLUSION = 1 |
| UNICODE_DATA_CANONICAL = 2 |
| UNICODE_DATA_COMPATIBILITY = 3 |
| UNICODE_DATA_UPPERCASE = 4 |
| UNICODE_DATA_LOWERCASE = 5 |
| UNICODE_DATA_TITLECASE = 6 |
| |
| begin |
| if defined?(FakeFS) |
| fakefs_state = FakeFS.activated? |
| FakeFS.deactivate! |
| end |
| # This is a sparse Unicode table. Codepoints without entries are |
| # assumed to have the value: [0, 0, nil, nil, nil, nil, nil] |
| UNICODE_DATA = File.open(UNICODE_TABLE, "rb") do |file| |
| Marshal.load(file.read) |
| end |
| ensure |
| if defined?(FakeFS) |
| FakeFS.activate! if fakefs_state |
| end |
| end |
| |
| COMPOSITION_TABLE = {} |
| for codepoint, data in UNICODE_DATA |
| canonical = data[UNICODE_DATA_CANONICAL] |
| exclusion = data[UNICODE_DATA_EXCLUSION] |
| |
| if canonical && exclusion == 0 |
| COMPOSITION_TABLE[canonical.unpack("C*")] = codepoint |
| end |
| end |
| |
| UNICODE_MAX_LENGTH = 256 |
| ACE_MAX_LENGTH = 256 |
| |
| PUNYCODE_BASE = 36 |
| PUNYCODE_TMIN = 1 |
| PUNYCODE_TMAX = 26 |
| PUNYCODE_SKEW = 38 |
| PUNYCODE_DAMP = 700 |
| PUNYCODE_INITIAL_BIAS = 72 |
| PUNYCODE_INITIAL_N = 0x80 |
| PUNYCODE_DELIMITER = 0x2D |
| |
| PUNYCODE_MAXINT = 1 << 64 |
| |
| PUNYCODE_PRINT_ASCII = |
| "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" + |
| "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" + |
| " !\"\#$%&'()*+,-./" + |
| "0123456789:;<=>?" + |
| "@ABCDEFGHIJKLMNO" + |
| "PQRSTUVWXYZ[\\]^_" + |
| "`abcdefghijklmno" + |
| "pqrstuvwxyz{|}~\n" |
| |
| # Input is invalid. |
| class PunycodeBadInput < StandardError; end |
| # Output would exceed the space provided. |
| class PunycodeBigOutput < StandardError; end |
| # Input needs wider integers to process. |
| class PunycodeOverflow < StandardError; end |
| |
| def self.punycode_encode(unicode) |
| unicode = unicode.to_s unless unicode.is_a?(String) |
| input = unicode.unpack("U*") |
| output = [0] * (ACE_MAX_LENGTH + 1) |
| input_length = input.size |
| output_length = [ACE_MAX_LENGTH] |
| |
| # Initialize the state |
| n = PUNYCODE_INITIAL_N |
| delta = out = 0 |
| max_out = output_length[0] |
| bias = PUNYCODE_INITIAL_BIAS |
| |
| # Handle the basic code points: |
| input_length.times do |j| |
| if punycode_basic?(input[j]) |
| if max_out - out < 2 |
| raise PunycodeBigOutput, |
| "Output would exceed the space provided." |
| end |
| output[out] = input[j] |
| out += 1 |
| end |
| end |
| |
| h = b = out |
| |
| # h is the number of code points that have been handled, b is the |
| # number of basic code points, and out is the number of characters |
| # that have been output. |
| |
| if b > 0 |
| output[out] = PUNYCODE_DELIMITER |
| out += 1 |
| end |
| |
| # Main encoding loop: |
| |
| while h < input_length |
| # All non-basic code points < n have been |
| # handled already. Find the next larger one: |
| |
| m = PUNYCODE_MAXINT |
| input_length.times do |j| |
| m = input[j] if (n...m) === input[j] |
| end |
| |
| # Increase delta enough to advance the decoder's |
| # <n,i> state to <m,0>, but guard against overflow: |
| |
| if m - n > (PUNYCODE_MAXINT - delta) / (h + 1) |
| raise PunycodeOverflow, "Input needs wider integers to process." |
| end |
| delta += (m - n) * (h + 1) |
| n = m |
| |
| input_length.times do |j| |
| # Punycode does not need to check whether input[j] is basic: |
| if input[j] < n |
| delta += 1 |
| if delta == 0 |
| raise PunycodeOverflow, |
| "Input needs wider integers to process." |
| end |
| end |
| |
| if input[j] == n |
| # Represent delta as a generalized variable-length integer: |
| |
| q = delta; k = PUNYCODE_BASE |
| while true |
| if out >= max_out |
| raise PunycodeBigOutput, |
| "Output would exceed the space provided." |
| end |
| t = ( |
| if k <= bias |
| PUNYCODE_TMIN |
| elsif k >= bias + PUNYCODE_TMAX |
| PUNYCODE_TMAX |
| else |
| k - bias |
| end |
| ) |
| break if q < t |
| output[out] = |
| punycode_encode_digit(t + (q - t) % (PUNYCODE_BASE - t)) |
| out += 1 |
| q = (q - t) / (PUNYCODE_BASE - t) |
| k += PUNYCODE_BASE |
| end |
| |
| output[out] = punycode_encode_digit(q) |
| out += 1 |
| bias = punycode_adapt(delta, h + 1, h == b) |
| delta = 0 |
| h += 1 |
| end |
| end |
| |
| delta += 1 |
| n += 1 |
| end |
| |
| output_length[0] = out |
| |
| outlen = out |
| outlen.times do |j| |
| c = output[j] |
| unless c >= 0 && c <= 127 |
| raise StandardError, "Invalid output char." |
| end |
| unless PUNYCODE_PRINT_ASCII[c] |
| raise PunycodeBadInput, "Input is invalid." |
| end |
| end |
| |
| output[0..outlen].map { |x| x.chr }.join("").sub(/\0+\z/, "") |
| end |
| (class <<self; private :punycode_encode; end) |
| |
| def self.punycode_decode(punycode) |
| input = [] |
| output = [] |
| |
| if ACE_MAX_LENGTH * 2 < punycode.size |
| raise PunycodeBigOutput, "Output would exceed the space provided." |
| end |
| punycode.each_byte do |c| |
| unless c >= 0 && c <= 127 |
| raise PunycodeBadInput, "Input is invalid." |
| end |
| input.push(c) |
| end |
| |
| input_length = input.length |
| output_length = [UNICODE_MAX_LENGTH] |
| |
| # Initialize the state |
| n = PUNYCODE_INITIAL_N |
| |
| out = i = 0 |
| max_out = output_length[0] |
| bias = PUNYCODE_INITIAL_BIAS |
| |
| # Handle the basic code points: Let b be the number of input code |
| # points before the last delimiter, or 0 if there is none, then |
| # copy the first b code points to the output. |
| |
| b = 0 |
| input_length.times do |j| |
| b = j if punycode_delimiter?(input[j]) |
| end |
| if b > max_out |
| raise PunycodeBigOutput, "Output would exceed the space provided." |
| end |
| |
| b.times do |j| |
| unless punycode_basic?(input[j]) |
| raise PunycodeBadInput, "Input is invalid." |
| end |
| output[out] = input[j] |
| out+=1 |
| end |
| |
| # Main decoding loop: Start just after the last delimiter if any |
| # basic code points were copied; start at the beginning otherwise. |
| |
| in_ = b > 0 ? b + 1 : 0 |
| while in_ < input_length |
| |
| # in_ is the index of the next character to be consumed, and |
| # out is the number of code points in the output array. |
| |
| # Decode a generalized variable-length integer into delta, |
| # which gets added to i. The overflow checking is easier |
| # if we increase i as we go, then subtract off its starting |
| # value at the end to obtain delta. |
| |
| oldi = i; w = 1; k = PUNYCODE_BASE |
| while true |
| if in_ >= input_length |
| raise PunycodeBadInput, "Input is invalid." |
| end |
| digit = punycode_decode_digit(input[in_]) |
| in_+=1 |
| if digit >= PUNYCODE_BASE |
| raise PunycodeBadInput, "Input is invalid." |
| end |
| if digit > (PUNYCODE_MAXINT - i) / w |
| raise PunycodeOverflow, "Input needs wider integers to process." |
| end |
| i += digit * w |
| t = ( |
| if k <= bias |
| PUNYCODE_TMIN |
| elsif k >= bias + PUNYCODE_TMAX |
| PUNYCODE_TMAX |
| else |
| k - bias |
| end |
| ) |
| break if digit < t |
| if w > PUNYCODE_MAXINT / (PUNYCODE_BASE - t) |
| raise PunycodeOverflow, "Input needs wider integers to process." |
| end |
| w *= PUNYCODE_BASE - t |
| k += PUNYCODE_BASE |
| end |
| |
| bias = punycode_adapt(i - oldi, out + 1, oldi == 0) |
| |
| # I was supposed to wrap around from out + 1 to 0, |
| # incrementing n each time, so we'll fix that now: |
| |
| if i / (out + 1) > PUNYCODE_MAXINT - n |
| raise PunycodeOverflow, "Input needs wider integers to process." |
| end |
| n += i / (out + 1) |
| i %= out + 1 |
| |
| # Insert n at position i of the output: |
| |
| # not needed for Punycode: |
| # raise PUNYCODE_INVALID_INPUT if decode_digit(n) <= base |
| if out >= max_out |
| raise PunycodeBigOutput, "Output would exceed the space provided." |
| end |
| |
| #memmove(output + i + 1, output + i, (out - i) * sizeof *output) |
| output[i + 1, out - i] = output[i, out - i] |
| output[i] = n |
| i += 1 |
| |
| out += 1 |
| end |
| |
| output_length[0] = out |
| |
| output.pack("U*") |
| end |
| (class <<self; private :punycode_decode; end) |
| |
| def self.punycode_basic?(codepoint) |
| codepoint < 0x80 |
| end |
| (class <<self; private :punycode_basic?; end) |
| |
| def self.punycode_delimiter?(codepoint) |
| codepoint == PUNYCODE_DELIMITER |
| end |
| (class <<self; private :punycode_delimiter?; end) |
| |
| def self.punycode_encode_digit(d) |
| d + 22 + 75 * ((d < 26) ? 1 : 0) |
| end |
| (class <<self; private :punycode_encode_digit; end) |
| |
| # Returns the numeric value of a basic codepoint |
| # (for use in representing integers) in the range 0 to |
| # base - 1, or PUNYCODE_BASE if codepoint does not represent a value. |
| def self.punycode_decode_digit(codepoint) |
| if codepoint - 48 < 10 |
| codepoint - 22 |
| elsif codepoint - 65 < 26 |
| codepoint - 65 |
| elsif codepoint - 97 < 26 |
| codepoint - 97 |
| else |
| PUNYCODE_BASE |
| end |
| end |
| (class <<self; private :punycode_decode_digit; end) |
| |
| # Bias adaptation method |
| def self.punycode_adapt(delta, numpoints, firsttime) |
| delta = firsttime ? delta / PUNYCODE_DAMP : delta >> 1 |
| # delta >> 1 is a faster way of doing delta / 2 |
| delta += delta / numpoints |
| difference = PUNYCODE_BASE - PUNYCODE_TMIN |
| |
| k = 0 |
| while delta > (difference * PUNYCODE_TMAX) / 2 |
| delta /= difference |
| k += PUNYCODE_BASE |
| end |
| |
| k + (difference + 1) * delta / (delta + PUNYCODE_SKEW) |
| end |
| (class <<self; private :punycode_adapt; end) |
| end |
| # :startdoc: |
| end |