| /* |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| */ |
| |
| #include <cstdio> |
| #include <cstring> |
| #include <sys/types.h> |
| #include <cstdlib> |
| #include <climits> |
| #include <cctype> |
| #include <cassert> |
| #include <cinttypes> |
| |
| #include "strip.h" |
| |
| static int copy_whitespace(const char **r, const char *in_end, char **w, const char *out_end); |
| |
| static int strip_whitespace(const char **r, const char **in_end); |
| |
| // Determine if there is room to store len bytes starting at p for an |
| // object that ends at maxp. This is not as simple as a less-than |
| // comparison, because our code may increment p well beyond the end of |
| // the object it originally pointed to (in complete violation of what |
| // ANSI C says is legitimate). The result is that p may wrap around. |
| // This has been observed with using stack buffers as arguments |
| // from 32 bit programs running on 64-bit RHEL. |
| static bool |
| room(const char *p, const int len, const char *maxp) |
| { |
| return ((maxp - (p + len)) >= 0); |
| } |
| |
| // write c into *p if there's room, always incrementing *p. |
| static void |
| write_char_if_room(char **p, const char *maxp, const char c) |
| { |
| if (p == nullptr || *p == nullptr) { |
| return; |
| } |
| |
| if (room(*p, 1, maxp)) { |
| **p = c; |
| } |
| |
| (*p)++; |
| } |
| |
| /// Write count spaces into *p if there's room, always adding count to *p. |
| // The count argument is set to zero at the end of execution. |
| static void |
| write_spaces_if_room(char **p, const char *maxp, int &slen) |
| { |
| if (p == nullptr || *p == nullptr) { |
| return; |
| } |
| |
| if (room(*p, slen, maxp)) { |
| memset(*p, ' ', slen); |
| } |
| |
| *p += slen; |
| slen = 0; |
| } |
| |
| // File-scope data |
| static const unsigned int allowed_flags = (STRIP_FLAG_LEAVE_WHITESP | STRIP_FLAG_STRIP_LOW | STRIP_FLAG_STRIP_HIGH | |
| STRIP_FLAG_UNSAFE_QUOTES | STRIP_FLAG_UNSAFE_SLASHES | STRIP_FLAG_UNSAFE_SPACES); |
| |
| static int |
| stripped_core(const char *r, const char *in_end, char **w, const char *out_end, unsigned int flags) |
| { |
| int leading = 1; /* haven't yet written a non-space */ |
| int in_js_entity = 0; /* are we inside a javascript entity? */ |
| char in_quote_char = '\0'; /* in quoted region? which kind: '\'' or '"' */ |
| int space = 0; /* number of spaces pending */ |
| int stripped = 0; /* have we stripped since last output? */ |
| int in_tag = 0; /* are we inside a tag? */ |
| |
| /* parse the string, stripping risky characters/sequences */ |
| for (/* already established */; r < in_end; r++) { |
| unsigned char c = static_cast<unsigned char>(*r); |
| if (in_tag) { |
| switch (c) { |
| case '>': |
| if (!in_quote_char) { |
| in_tag = 0; |
| } |
| break; |
| |
| case '"': |
| case '\'': |
| if (!in_quote_char) { |
| in_quote_char = c; |
| } else if (in_quote_char == c) { |
| in_quote_char = '\0'; |
| } |
| break; |
| |
| default: |
| break; /* eat everything between < and > */ |
| } |
| } else if (in_js_entity) { |
| switch (c) { |
| case '}': |
| if (!in_quote_char) { |
| in_js_entity = 0; |
| if (r + 1 < in_end && *(r + 1) == ';') { |
| r++; |
| } |
| } |
| break; |
| |
| case '"': |
| case '\'': |
| if (!in_quote_char) { |
| in_quote_char = c; |
| } else if (in_quote_char == c) { |
| in_quote_char = '\0'; |
| } |
| break; |
| |
| default: |
| break; /* eat everything between < and > */ |
| } |
| } else { |
| if (c == '<') { |
| in_tag = 1; |
| stripped = 1; |
| } else if (c == '&' && r + 1 < in_end && *(r + 1) == '{') { |
| in_js_entity = 1; |
| stripped = 1; |
| r++; |
| } else if ((c < 0x07 && (flags & STRIP_FLAG_STRIP_LOW)) || (c >= 0x80 && (flags & STRIP_FLAG_STRIP_HIGH)) || |
| (c == '"' && !(flags & STRIP_FLAG_UNSAFE_QUOTES)) || (c == '\'' && !(flags & STRIP_FLAG_UNSAFE_QUOTES)) || |
| (c == '\\' && !(flags & STRIP_FLAG_UNSAFE_SLASHES)) || c == '>') { |
| stripped = 1; |
| } else if (c == ' ') { |
| space++; /* don't collapse existing spaces */ |
| } else { |
| /* we're ready to write an output character */ |
| if (leading) { |
| leading = 0; /* first non-whitespace character */ |
| stripped = 0; |
| if (!(flags & STRIP_FLAG_LEAVE_WHITESP)) { |
| space = 0; |
| } |
| } |
| |
| /* flush pending spaces */ |
| if (!space && stripped && !(flags & STRIP_FLAG_UNSAFE_SPACES)) { |
| space = 1; /* replace stripped sequence with space */ |
| } |
| stripped = 0; /* reset until next stripped sequence */ |
| write_spaces_if_room(w, out_end, space); |
| |
| /* Process as single character. */ |
| write_char_if_room(w, out_end, c); |
| } |
| } |
| } |
| |
| /* Restore trailing whitespace if asked */ |
| if (flags & STRIP_FLAG_LEAVE_WHITESP) { |
| write_spaces_if_room(w, out_end, space); |
| } |
| |
| return STRIP_RESULT_OK; |
| } |
| |
| int |
| get_stripped(const char *in, ssize_t in_len, char *out, int *out_len, unsigned int flags) |
| { |
| int retval = STRIP_RESULT_OK; |
| const char *r, *in_end; /* where we read from, read limit */ |
| char *w, *out_end; /* where we write to, write limit */ |
| |
| /* validate params */ |
| if (in == nullptr || in_len < 0 || out_len == nullptr || *out_len < 0 || (out == nullptr && *out_len > 0) || |
| (flags & (~allowed_flags))) { |
| if (out != nullptr && out_len != nullptr && *out_len > 0) { |
| *out = '\0'; |
| *out_len = 1; |
| } |
| return STRIP_RESULT_BAD_PARAM; |
| } |
| |
| /* make room for null terminator in output and remove if present in in */ |
| (*out_len) -= out ? 1 : 0; /* make space for '\0' unless NULL out */ |
| if (in_len > 0 && in[in_len - 1] == '\0') { |
| in_len--; /* don't count null terminator in input */ |
| } |
| |
| /* establish our read and write limits */ |
| r = in; |
| w = out; |
| in_end = in + in_len; |
| out_end = out + *out_len; |
| |
| /* strip leading and trailing whitespace, unless asked not to */ |
| if (!(flags & STRIP_FLAG_LEAVE_WHITESP)) { |
| strip_whitespace(&r, &in_end); |
| } else { |
| copy_whitespace(&r, in_end, &w, out_end); |
| } |
| |
| /* handle empty input case (null terminated or not) */ |
| if ((!(flags & STRIP_FLAG_LEAVE_WHITESP) && r >= in_end) || ((flags & STRIP_FLAG_LEAVE_WHITESP) && in_len == 0)) { |
| write_char_if_room(&w, out_end, '\0'); /* make out empty string */ |
| *out_len = 1; |
| return STRIP_RESULT_EMPTY_IN; /* input is empty string */ |
| } |
| |
| /* call the core function that does actual checking and stripping */ |
| retval = stripped_core(r, in_end, &w, out_end, flags); |
| |
| /* null terminate */ |
| out_end += out_end ? 1 : 0; /* undo decrement at start */ |
| write_char_if_room(&w, out_end, '\0'); /* try to term at end of output */ |
| |
| /* report the required/used length */ |
| *out_len = w - out; |
| |
| /* see if we ran out of space, but were otherwise ok */ |
| if (w > out_end && retval == STRIP_RESULT_OK) { |
| retval = STRIP_RESULT_OUTLEN_SMALL; |
| } |
| |
| if (retval != STRIP_RESULT_OK) { |
| /* return the empty string on all errors */ |
| write_char_if_room(&out, out_end, '\0'); /* make out the empty string */ |
| if (retval != STRIP_RESULT_OUTLEN_SMALL) { |
| *out_len = 1; /* even if retried, we won't use more than 1 byte */ |
| } |
| } |
| |
| return retval; |
| } |
| |
| /* |
| * Copy sequence of whitespace from r to w |
| */ |
| static int |
| copy_whitespace(const char **r, const char *in_end, char **w, const char *out_end) |
| { |
| char c; |
| while (*r < in_end && (c = **r) && (c == ' ' || c == '\t' || c == '\r' || c == '\n')) { |
| write_char_if_room(w, out_end, c); |
| (*r)++; |
| } |
| return 0; |
| } |
| |
| static int |
| strip_whitespace(const char **r, const char **in_end) |
| { |
| char c; |
| while (*r < *in_end && (c = **r) && (c == ' ' || c == '\t' || c == '\r' || c == '\n')) { |
| (*r)++; |
| } |
| while (*in_end > *r && (c = *((*in_end) - 1)) && (c == ' ' || c == '\t' || c == '\r' || c == '\n')) { |
| (*in_end)--; |
| } |
| return 0; |
| } |