be/src/util/url-parser.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "util/url-parser.h"
 #include "runtime/string-value.inline.h"

 #include "common/names.h"

 namespace impala {

 // Names of the URL parts that GetUrlPart() recognizes. Each value pairs a string
 // literal with its length in characters (the terminating NUL is not counted).
 // NOTE(review): the const_cast suggests StringValue stores a non-const char*; the
 // literals must never be written through these values.
 const StringValue UrlParser::url_authority(const_cast<char*>("AUTHORITY"), 9);
 const StringValue UrlParser::url_file(const_cast<char*>("FILE"), 4);
 const StringValue UrlParser::url_host(const_cast<char*>("HOST"), 4);
 const StringValue UrlParser::url_path(const_cast<char*>("PATH"), 4);
 const StringValue UrlParser::url_protocol(const_cast<char*>("PROTOCOL"), 8);
 const StringValue UrlParser::url_query(const_cast<char*>("QUERY"), 5);
 const StringValue UrlParser::url_ref(const_cast<char*>("REF"), 3);
 const StringValue UrlParser::url_userinfo(const_cast<char*>("USERINFO"), 8);
 // Delimiters that ParseUrl() searches for while splitting a URL.
 const StringValue UrlParser::protocol(const_cast<char*>("://"), 3);
 const StringValue UrlParser::at(const_cast<char*>("@"), 1);
 const StringValue UrlParser::slash(const_cast<char*>("/"), 1);
 const StringValue UrlParser::colon(const_cast<char*>(":"), 1);
 const StringValue UrlParser::question(const_cast<char*>("?"), 1);
 const StringValue UrlParser::hash(const_cast<char*>("#"), 1);
 // One searcher per delimiter, each constructed from the address of the delimiter
 // value defined above.
 // NOTE(review): if StringSearch reads its pattern during construction, these
 // definitions must stay below the delimiter definitions they point to -- confirm
 // before reordering.
 const StringSearch UrlParser::protocol_search(&protocol);
 const StringSearch UrlParser::at_search(&at);
 const StringSearch UrlParser::slash_search(&slash);
 const StringSearch UrlParser::colon_search(&colon);
 const StringSearch UrlParser::question_search(&question);
 const StringSearch UrlParser::hash_search(&hash);

 bool UrlParser::ParseUrl(const StringValue& url, UrlPart part, StringValue* result) {
   result->ptr = NULL;
   result->len = 0;
   // Remove leading and trailing spaces.
   StringValue trimmed_url = url.Trim();

   // All parts require checking for the protocol.
   int32_t protocol_pos = protocol_search.Search(&trimmed_url);
   if (protocol_pos < 0) return false;
   // Positioned to first char after '://'.
   StringValue protocol_end = trimmed_url.Substring(protocol_pos + protocol.len);

   // Find the end of the authority. The authority ends at the first '/' or '?'.
   int32_t auth_end_pos = -1;
   {
     int32_t first_slash = slash_search.Search(&protocol_end);
     int32_t first_question = question_search.Search(&protocol_end);

     auth_end_pos = first_slash;
     if (first_slash < 0 || (0 <= first_question && first_question < first_slash)) {
       // Either we did not find a slash, or there is one and the first question mark is
       // left of the first slash (after the protocol), for example:
       // http://example.com?dir=/etc
       auth_end_pos = first_question;
     }
   }

   switch(part) {
     case AUTHORITY: {
       *result = protocol_end.Substring(0, auth_end_pos);
       break;
     }

     case FILE:
     case PATH: {
       // Find first '/'.
       int32_t start_pos = slash_search.Search(&protocol_end);
       if (start_pos < 0) {
         // Return empty string. This is what Hive does.
         return true;
       }
       StringValue path_start = protocol_end.Substring(start_pos);
       int32_t end_pos;
       if (part == FILE) {
         // End at '#'.
         end_pos = hash_search.Search(&path_start);
       } else {
         // End string at next '?' or '#'.
         end_pos = question_search.Search(&path_start);
         if (end_pos < 0) {
           // No '?' was found, look for '#'.
           end_pos = hash_search.Search(&path_start);
         }
       }
       *result = path_start.Substring(0, end_pos);
       break;
     }

     case HOST: {
       // The '@' character can occur at three places: in the authority, in the path, or in
       // the query. An example for all three would be:
       // http://user:pass@e.com/get/@me?mail=foo@bar

       // In order to get the host part we first extract the authority and then strip away
       // the userinfo and port.
       StringValue authority = protocol_end.Substring(0, auth_end_pos);

       // Find '@' to strip away userinfo.
       int32_t at_pos = at_search.Search(&authority);
       if (at_pos < 0) {
         // No '@' was found, i.e., no user:pass info was given, so start after protocol.
         at_pos = 0;
       } else {
         // Skip '@'.
         at_pos += at.len;
       }
       StringValue host_and_port = authority.Substring(at_pos);

       // Find ':' to strip out port.
       int32_t colon_pos = colon_search.Search(&host_and_port);
       *result = host_and_port.Substring(0, colon_pos);
       break;
     }

     case PROTOCOL: {
       *result = trimmed_url.Substring(0, protocol_pos);
       break;
     }

     case QUERY: {
       // Find first '?'.
       int32_t start_pos = question_search.Search(&protocol_end);
       if (start_pos < 0) {
         // Indicate no query was found.
         return false;
       }
       StringValue query_start = protocol_end.Substring(start_pos + question.len);
       // End string at next '#'.
       int32_t end_pos = hash_search.Search(&query_start);
       *result = query_start.Substring(0, end_pos);
       break;
     }

     case REF: {
       // Find '#'.
       int32_t start_pos = hash_search.Search(&protocol_end);
       if (start_pos < 0) {
         // Indicate no user and pass were given.
         return false;
       }
       *result = protocol_end.Substring(start_pos + hash.len);
       break;
     }

     case USERINFO: {
       // Find '@'.
       int32_t at_pos = at_search.Search(&protocol_end);
       if (at_pos < 0 || (auth_end_pos > 0 && at_pos > auth_end_pos)) {
         // Indicate no user and pass were given.
         return false;
       }
       *result = protocol_end.Substring(0, at_pos);
       break;
     }

     case INVALID: return false;
   }
   return true;
 }

 bool UrlParser::ParseUrlKey(const StringValue& url, UrlPart part,
       const StringValue& key, StringValue* result) {
   // Part must be query to ask for a specific query key.
   if (part != QUERY) {
     return false;
   }
   // Remove leading and trailing spaces.
   StringValue trimmed_url = url.Trim();

   // Search for the key in the url, ignoring malformed URLs for now.
   StringSearch key_search(&key);
   while(trimmed_url.len > 0) {
     // Search for the key in the current substring.
     int32_t key_pos = key_search.Search(&trimmed_url);
     bool match = true;
     if (key_pos < 0) {
       return false;
     }
     // Key pos must be != 0 because it must be preceded by a '?' or a '&'.
     // Check that the char before key_pos is either '?' or '&'.
     if (key_pos == 0 ||
         (trimmed_url.ptr[key_pos - 1] != '?' && trimmed_url.ptr[key_pos - 1] != '&')) {
       match = false;
     }
     // Advance substring beyond matching key.
     trimmed_url = trimmed_url.Substring(key_pos + key.len);
     if (!match) {
       continue;
     }
     if (trimmed_url.len <= 0) {
       break;
     }
     // Next character must be '=', otherwise the match cannot be a key in the query part.
     if (trimmed_url.ptr[0] != '=') {
       continue;
     }
     int32_t pos = 1;
     // Find ending position of key's value by matching '#' or '&'.
     while(pos < trimmed_url.len) {
       switch(trimmed_url.ptr[pos]) {
         case '#':
         case '&':
           *result = trimmed_url.Substring(1, pos - 1);
           return true;
       }
       ++pos;
     }
     // Ending position is end of string.
     *result = trimmed_url.Substring(1);
     return true;
   }
   return false;
 }

 // Maps the textual part name 'part' to the corresponding UrlPart value.
 //
 // The name must match one of AUTHORITY, FILE, HOST, PATH, PROTOCOL, QUERY, REF or
 // USERINFO exactly (Hive requires the requested URL part to be all upper case).
 // Returns INVALID for any other name, including the empty string.
 UrlParser::UrlPart UrlParser::GetUrlPart(const StringValue& part) {
   // An empty name matches nothing; return before part.ptr[0] is read below, since an
   // empty value has no first character to inspect.
   if (part.len <= 0) return INVALID;

   // Quick filter on requested URL part, based on first character; the full comparison
   // then runs against at most two candidates.
   switch(part.ptr[0]) {
     case 'A': return part.Eq(url_authority) ? AUTHORITY : INVALID;
     case 'F': return part.Eq(url_file) ? FILE : INVALID;
     case 'H': return part.Eq(url_host) ? HOST : INVALID;
     case 'P': {
       // Two part names start with 'P'.
       if (part.Eq(url_path)) return PATH;
       if (part.Eq(url_protocol)) return PROTOCOL;
       return INVALID;
     }
     case 'Q': return part.Eq(url_query) ? QUERY : INVALID;
     case 'R': return part.Eq(url_ref) ? REF : INVALID;
     case 'U': return part.Eq(url_userinfo) ? USERINFO : INVALID;
     default: return INVALID;
   }
 }

 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "util/url-parser.h"
	#include "runtime/string-value.inline.h"

	#include "common/names.h"

	namespace impala {

	// Names of the URL parts that GetUrlPart() recognizes. Each value pairs a string
	// literal with its length in characters (the terminating NUL is not counted).
	// NOTE(review): the const_cast suggests StringValue stores a non-const char*; the
	// literals must never be written through these values.
	const StringValue UrlParser::url_authority(const_cast<char*>("AUTHORITY"), 9);
	const StringValue UrlParser::url_file(const_cast<char*>("FILE"), 4);
	const StringValue UrlParser::url_host(const_cast<char*>("HOST"), 4);
	const StringValue UrlParser::url_path(const_cast<char*>("PATH"), 4);
	const StringValue UrlParser::url_protocol(const_cast<char*>("PROTOCOL"), 8);
	const StringValue UrlParser::url_query(const_cast<char*>("QUERY"), 5);
	const StringValue UrlParser::url_ref(const_cast<char*>("REF"), 3);
	const StringValue UrlParser::url_userinfo(const_cast<char*>("USERINFO"), 8);
	// Delimiters that ParseUrl() searches for while splitting a URL.
	const StringValue UrlParser::protocol(const_cast<char*>("://"), 3);
	const StringValue UrlParser::at(const_cast<char*>("@"), 1);
	const StringValue UrlParser::slash(const_cast<char*>("/"), 1);
	const StringValue UrlParser::colon(const_cast<char*>(":"), 1);
	const StringValue UrlParser::question(const_cast<char*>("?"), 1);
	const StringValue UrlParser::hash(const_cast<char*>("#"), 1);
	// One searcher per delimiter, each constructed from the address of the delimiter
	// value defined above.
	// NOTE(review): if StringSearch reads its pattern during construction, these
	// definitions must stay below the delimiter definitions they point to -- confirm
	// before reordering.
	const StringSearch UrlParser::protocol_search(&protocol);
	const StringSearch UrlParser::at_search(&at);
	const StringSearch UrlParser::slash_search(&slash);
	const StringSearch UrlParser::colon_search(&colon);
	const StringSearch UrlParser::question_search(&question);
	const StringSearch UrlParser::hash_search(&hash);

	bool UrlParser::ParseUrl(const StringValue& url, UrlPart part, StringValue* result) {
	result->ptr = NULL;
	result->len = 0;
	// Remove leading and trailing spaces.
	StringValue trimmed_url = url.Trim();

	// All parts require checking for the protocol.
	int32_t protocol_pos = protocol_search.Search(&trimmed_url);
	if (protocol_pos < 0) return false;
	// Positioned to first char after '://'.
	StringValue protocol_end = trimmed_url.Substring(protocol_pos + protocol.len);

	// Find the end of the authority. The authority ends at the first '/' or '?'.
	int32_t auth_end_pos = -1;
	{
	int32_t first_slash = slash_search.Search(&protocol_end);
	int32_t first_question = question_search.Search(&protocol_end);

	auth_end_pos = first_slash;
	if (first_slash < 0 \|\| (0 <= first_question && first_question < first_slash)) {
	// Either we did not find a slash, or there is one and the first question mark is
	// left of the first slash (after the protocol), for example:
	// http://example.com?dir=/etc
	auth_end_pos = first_question;
	}
	}

	switch(part) {
	case AUTHORITY: {
	*result = protocol_end.Substring(0, auth_end_pos);
	break;
	}

	case FILE:
	case PATH: {
	// Find first '/'.
	int32_t start_pos = slash_search.Search(&protocol_end);
	if (start_pos < 0) {
	// Return empty string. This is what Hive does.
	return true;
	}
	StringValue path_start = protocol_end.Substring(start_pos);
	int32_t end_pos;
	if (part == FILE) {
	// End at '#'.
	end_pos = hash_search.Search(&path_start);
	} else {
	// End string at next '?' or '#'.
	end_pos = question_search.Search(&path_start);
	if (end_pos < 0) {
	// No '?' was found, look for '#'.
	end_pos = hash_search.Search(&path_start);
	}
	}
	*result = path_start.Substring(0, end_pos);
	break;
	}

	case HOST: {
	// The '@' character can occur at three places: in the authority, in the path, or in
	// the query. An example for all three would be:
	// http://user:pass@e.com/get/@me?mail=foo@bar

	// In order to get the host part we first extract the authority and then strip away
	// the userinfo and port.
	StringValue authority = protocol_end.Substring(0, auth_end_pos);

	// Find '@' to strip away userinfo.
	int32_t at_pos = at_search.Search(&authority);
	if (at_pos < 0) {
	// No '@' was found, i.e., no user:pass info was given, so start after protocol.
	at_pos = 0;
	} else {
	// Skip '@'.
	at_pos += at.len;
	}
	StringValue host_and_port = authority.Substring(at_pos);

	// Find ':' to strip out port.
	int32_t colon_pos = colon_search.Search(&host_and_port);
	*result = host_and_port.Substring(0, colon_pos);
	break;
	}

	case PROTOCOL: {
	*result = trimmed_url.Substring(0, protocol_pos);
	break;
	}

	case QUERY: {
	// Find first '?'.
	int32_t start_pos = question_search.Search(&protocol_end);
	if (start_pos < 0) {
	// Indicate no query was found.
	return false;
	}
	StringValue query_start = protocol_end.Substring(start_pos + question.len);
	// End string at next '#'.
	int32_t end_pos = hash_search.Search(&query_start);
	*result = query_start.Substring(0, end_pos);
	break;
	}

	case REF: {
	// Find '#'.
	int32_t start_pos = hash_search.Search(&protocol_end);
	if (start_pos < 0) {
	// Indicate no user and pass were given.
	return false;
	}
	*result = protocol_end.Substring(start_pos + hash.len);
	break;
	}

	case USERINFO: {
	// Find '@'.
	int32_t at_pos = at_search.Search(&protocol_end);
	if (at_pos < 0 \|\| (auth_end_pos > 0 && at_pos > auth_end_pos)) {
	// Indicate no user and pass were given.
	return false;
	}
	*result = protocol_end.Substring(0, at_pos);
	break;
	}

	case INVALID: return false;
	}
	return true;
	}

	bool UrlParser::ParseUrlKey(const StringValue& url, UrlPart part,
	const StringValue& key, StringValue* result) {
	// Part must be query to ask for a specific query key.
	if (part != QUERY) {
	return false;
	}
	// Remove leading and trailing spaces.
	StringValue trimmed_url = url.Trim();

	// Search for the key in the url, ignoring malformed URLs for now.
	StringSearch key_search(&key);
	while(trimmed_url.len > 0) {
	// Search for the key in the current substring.
	int32_t key_pos = key_search.Search(&trimmed_url);
	bool match = true;
	if (key_pos < 0) {
	return false;
	}
	// Key pos must be != 0 because it must be preceded by a '?' or a '&'.
	// Check that the char before key_pos is either '?' or '&'.
	if (key_pos == 0 \|\|
	(trimmed_url.ptr[key_pos - 1] != '?' && trimmed_url.ptr[key_pos - 1] != '&')) {
	match = false;
	}
	// Advance substring beyond matching key.
	trimmed_url = trimmed_url.Substring(key_pos + key.len);
	if (!match) {
	continue;
	}
	if (trimmed_url.len <= 0) {
	break;
	}
	// Next character must be '=', otherwise the match cannot be a key in the query part.
	if (trimmed_url.ptr[0] != '=') {
	continue;
	}
	int32_t pos = 1;
	// Find ending position of key's value by matching '#' or '&'.
	while(pos < trimmed_url.len) {
	switch(trimmed_url.ptr[pos]) {
	case '#':
	case '&':
	*result = trimmed_url.Substring(1, pos - 1);
	return true;
	}
	++pos;
	}
	// Ending position is end of string.
	*result = trimmed_url.Substring(1);
	return true;
	}
	return false;
	}

	// Maps the textual part name 'part' to the corresponding UrlPart value.
	//
	// The name must match one of AUTHORITY, FILE, HOST, PATH, PROTOCOL, QUERY, REF or
	// USERINFO exactly (Hive requires the requested URL part to be all upper case).
	// Returns INVALID for any other name, including the empty string.
	UrlParser::UrlPart UrlParser::GetUrlPart(const StringValue& part) {
	  // An empty name matches nothing; return before part.ptr[0] is read below, since an
	  // empty value has no first character to inspect.
	  if (part.len <= 0) return INVALID;

	  // Quick filter on requested URL part, based on first character; the full comparison
	  // then runs against at most two candidates.
	  switch(part.ptr[0]) {
	    case 'A': return part.Eq(url_authority) ? AUTHORITY : INVALID;
	    case 'F': return part.Eq(url_file) ? FILE : INVALID;
	    case 'H': return part.Eq(url_host) ? HOST : INVALID;
	    case 'P': {
	      // Two part names start with 'P'.
	      if (part.Eq(url_path)) return PATH;
	      if (part.Eq(url_protocol)) return PROTOCOL;
	      return INVALID;
	    }
	    case 'Q': return part.Eq(url_query) ? QUERY : INVALID;
	    case 'R': return part.Eq(url_ref) ? REF : INVALID;
	    case 'U': return part.Eq(url_userinfo) ? USERINFO : INVALID;
	    default: return INVALID;
	  }
	}

	}