// blob: 38a9e9c262aa4600f1c6f98fda164c1853f5a89e [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: mbelshe@google.com (Mike Belshe)
// jmarantz@google.com (Joshua Marantz)
#include "pagespeed/kernel/util/url_to_filename_encoder.h"
#include "base/logging.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/http/google_url.h"
namespace net_instaweb {
// The escape character choice is made here -- all code and tests in this
// directory are based off of this constant. However, lots of tests
// have dependencies on this, so it cannot be changed without re-running those
// tests and fixing them.
//
// kEscapeChar introduces every escape sequence the encoder in this file
// writes: ",XX" (two uppercase hex digits) for a byte that is not
// filename-safe, a leading "," in front of a path segment that is exactly
// "." or "..", and ",-" where an over-long segment was split.  It is also
// appended to the leaf name, so every complete encoding ends with it.
const char UrlToFilenameEncoder::kEscapeChar = ',';
// Written immediately after kEscapeChar at the point where AppendSegment cut
// an over-long segment; Decode expects a directory separator right after it
// and drops that separator.
const char UrlToFilenameEncoder::kTruncationChar = '-';
// Longest segment, in characters, that AppendSegment emits without splitting.
const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
// Moves as much of |*segment| as fits into one path component onto the end
// of |*dest|.
//
// |segment| must be non-empty (enforced with CHECK).  On return it holds
// whatever was NOT written:
//   - If the segment is exactly "." or "..", it is written prefixed with
//     kEscapeChar and |segment| is emptied.
//   - If the segment is no longer than kMaximumSubdirectoryLength, it is
//     written verbatim and |segment| is emptied.
//   - Otherwise only a leading chunk is written, followed by
//     kEscapeChar + kTruncationChar, and that chunk is removed from the
//     front of |segment|.  The caller is responsible for emitting the
//     directory separator and for dealing with the remainder.
void UrlToFilenameEncoder::AppendSegment(GoogleString* segment,
                                         GoogleString* dest) {
  CHECK(!segment->empty());

  const bool is_dot_entry = (*segment == ".") || (*segment == "..");
  if (is_dot_entry) {
    // Mark the segment so it cannot be mistaken for a relative-path entry.
    dest->push_back(kEscapeChar);
  }
  if (is_dot_entry || (segment->size() <= kMaximumSubdirectoryLength)) {
    dest->append(*segment);
    segment->clear();
    return;
  }

  // The segment is too long.  The two-character truncation marker has to fit
  // inside the limit as well, so keep at most (limit - 2) characters.
  size_t keep = kMaximumSubdirectoryLength - 2;

  // Do not cut through a three-character ",XX" escape: if the escape
  // character is one of the last two characters kept, back up so that the
  // whole escape sequence stays in the remainder.
  if ((*segment)[keep - 1] == kEscapeChar) {
    keep -= 1;
  } else if ((*segment)[keep - 2] == kEscapeChar) {
    keep -= 2;
  }

  // Example: with keep == 3 and segment == "abcd" this writes "abc,-" and
  // leaves segment == "d".
  dest->append(*segment, 0, keep);
  dest->push_back(kEscapeChar);
  dest->push_back(kTruncationChar);
  segment->erase(0, keep);
}
// Encodes |escaped_ending| as a filesystem-safe path underneath
// |filename_prefix| and stores the result in |*encoded_filename|.
//
//   filename_prefix   Leading path.  Everything up to and including its last
//                     |dir_separator| is copied to |*encoded_filename| with
//                     CopyToString; the text after that separator (or the
//                     whole prefix when it contains no separator) becomes the
//                     start of the first encoded segment.  When the prefix
//                     has no separator this function only appends to
//                     |*encoded_filename|.
//   escaped_ending    The part to encode.  It is first passed through
//                     GoogleUrl::UnescapeIgnorePlus.
//   dir_separator     Character that separates path segments, both in the
//                     inputs and in the output.
//   encoded_filename  Output string.
//
// Within the ending, characters other than alphanumerics and "_.=+-" are
// written as kEscapeChar followed by two uppercase hex digits.  Segments
// longer than kMaximumSubdirectoryLength are split by AppendSegment, and the
// leaf always ends in kEscapeChar.
void UrlToFilenameEncoder::EncodeSegment(const StringPiece& filename_prefix,
                                         const StringPiece& escaped_ending,
                                         char dir_separator,
                                         GoogleString* encoded_filename) {
  // We want to unescape URLs so that any %-encodings are cleaned up.  However,
  // we do not want to convert "+" to " " in this context, since
  // "+" is fine in a filename, and " " will be escaped here to ",20" below.
  GoogleString filename_ending = GoogleUrl::UnescapeIgnorePlus(escaped_ending);
  GoogleString segment;

  // TODO(jmarantz): This code would be a bit simpler if we disallowed
  // Instaweb allowing filename_prefix to not end in "/".  We could
  // then change this routine to just take one input string.
  size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
  if (start_of_segment == GoogleString::npos) {
    filename_prefix.CopyToString(&segment);
  } else {
    filename_prefix.substr(start_of_segment + 1).CopyToString(&segment);
    filename_prefix.substr(0, start_of_segment + 1).CopyToString(
        encoded_filename);
  }

  size_t index = 0;
  // Special case the first / to avoid adding a leading kEscapeChar.  The
  // prefix remainder is written as-is here (no escaping, no splitting).
  if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
    encoded_filename->append(segment);
    segment.clear();
    encoded_filename->push_back(dir_separator);
    ++index;
  }

  static const char kHexDigits[] = "0123456789ABCDEF";
  for (; index < filename_ending.length(); ++index) {
    const unsigned char ch = static_cast<unsigned char>(filename_ending[index]);

    // Note: instead of outputting an empty segment, we let the second slash
    // be escaped below.
    if ((ch == dir_separator) && !segment.empty()) {
      // Flush the whole segment.  AppendSegment may write only a leading
      // chunk when the segment is over-long, so keep going until nothing is
      // left; dropping the remainder would lose characters and make the
      // separator written here look like a truncation separator to Decode.
      do {
        AppendSegment(&segment, encoded_filename);
        encoded_filename->push_back(dir_separator);
      } while (!segment.empty());
      continue;
    }

    // After removing unsafe chars the only safe ones are _.=+- and alphanums.
    const bool is_safe =
        (ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
        (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
        (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'));
    if (is_safe) {
      segment.push_back(static_cast<char>(ch));
    } else {
      // kEscapeChar followed by the byte value as two uppercase hex digits.
      segment.push_back(kEscapeChar);
      segment.push_back(kHexDigits[ch / 16]);
      segment.push_back(kHexDigits[ch % 16]);
    }

    // If segment is too big, we must chop it into chunks.  A loop is needed
    // because a long prefix remainder can put the segment more than one chunk
    // over the limit.
    while (segment.size() > kMaximumSubdirectoryLength) {
      AppendSegment(&segment, encoded_filename);
      encoded_filename->push_back(dir_separator);
    }
  }

  // Append "," to the leaf filename so the leaf can also be a branch, e.g.
  // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
  // "/a/b/c/d,".  If doing that pushed us over the segment length limit, the
  // remaining chars go into further segment(s) after a separator.
  segment += kEscapeChar;
  AppendSegment(&segment, encoded_filename);
  while (!segment.empty()) {
    // The overflow ends in kEscapeChar, so AppendSegment never treats it as
    // "." or "..".  It is only split again if it is itself still over-long.
    encoded_filename->push_back(dir_separator);
    AppendSegment(&segment, encoded_filename);
  }
}
bool UrlToFilenameEncoder::Decode(const StringPiece& encoded_filename,
GoogleString* decoded_url) {
const char kDirSeparator = '/';
enum State {
kStart,
kEscape,
kFirstDigit,
kTruncate,
kEscapeDot
};
State state = kStart;
char hex_buffer[3] = { '\0', '\0', '\0' };
for (int i = 0, n = encoded_filename.size(); i < n; ++i) {
char ch = encoded_filename[i];
switch (state) {
case kStart:
if (ch == UrlToFilenameEncoder::kEscapeChar) {
state = kEscape;
} else if (ch == kDirSeparator) {
decoded_url->push_back('/'); // URLs only use '/' not '\\'
} else {
decoded_url->push_back(ch);
}
break;
case kEscape:
if (IsHexDigit(ch)) {
hex_buffer[0] = ch;
state = kFirstDigit;
} else if (ch == UrlToFilenameEncoder::kTruncationChar) {
state = kTruncate;
} else if (ch == '.') {
decoded_url->push_back('.');
state = kEscapeDot; // Look for at most one more dot.
} else if (ch == kDirSeparator) {
// Consider url "//x". This was once encoded to "/,/x,".
// This code is what skips the first Escape.
decoded_url->push_back('/'); // URLs only use '/' not '\\'
state = kStart;
} else {
return false;
}
break;
case kFirstDigit:
if (IsHexDigit(ch)) {
hex_buffer[1] = ch;
uint32 hex_value = 0;
bool ok = AccumulateHexValue(hex_buffer[0], &hex_value);
ok = ok && AccumulateHexValue(hex_buffer[1], &hex_value);
DCHECK(ok) << "Should not have gotten here unless both were hex";
decoded_url->push_back(static_cast<char>(hex_value));
state = kStart;
} else {
return false;
}
break;
case kTruncate:
if (ch == kDirSeparator) {
// Skip this separator, it was only put in to break up long
// path segments, but is not part of the URL.
state = kStart;
} else {
return false;
}
break;
case kEscapeDot:
decoded_url->push_back(ch);
state = kStart;
break;
}
}
// All legal encoded filenames end in kEscapeChar.
return (state == kEscape);
}
} // namespace net_instaweb