util_pcre: Add a thread local subpool cache for when stack does not suffice.

When AP_HAS_THREAD_LOCAL is available, use a thread-local match_thread_state to
save per-thread data in a subpool of the thread's pool.

If private_malloc() gets out of the stack buffer and the current thread has a
pool (i.e. ap_thread_current() != NULL), it will apr_palloc()ate and return
memory from the subpool.

When the match is complete and the match_data are freed, the thread subpool is
cleared thus giving back the memory to the allocator, which itself will give
back the memory or recycle it depending on its max_free setting.

* util_pcre.c:
  Restore POSIX_MALLOC_THRESHOLDsince this is part of the user API.

* util_pcre.c(match_data_pt):
  Type not used (explicitely) anymore, axe.
  
* util_pcre.c(struct match_data_state):
  Put the stack buffer there to simplify code (the state is allocated on
  stack anyway).
  If APREG_USE_THREAD_LOCAL, add the apr_thread_t* and match_thread_state*
  fields that track the thread local data for the match.

* util_pcre.c(alloc_match_data, free_match):
  Renamed to setup_state() and cleanup_state(), simplified (no stack buffer
  parameters anymore).
  cleanup_state() now clears the thread local subpool if used during the match.
  setup_state() set state->thd to ap_thread_current(), thus NULL if it's not a
  suitable thread for using thread local data.

* util_pcre.c(private_malloc):
  Fix a possible buf_used overflow (size <= avail < APR_ALIGN_DEFAULT(size)).
  Create the thread local subpool (once per thread) and allocate from there
  when stack space is missing and state->thd != NULL, otherwise fall back to
  malloc() still.

* util_pcre.c(private_free):
  Do nothing for thread local subpool memory, will be freed in cleanup_state
  eventually.



git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1902731 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/server/util_pcre.c b/server/util_pcre.c
index 11b13d6..53096ea 100644
--- a/server/util_pcre.c
+++ b/server/util_pcre.c
@@ -73,8 +73,19 @@
 #define APR_WANT_STRFUNC
 #include "apr_want.h"
 
-#ifndef AP_PCRE_STACKBUF_SIZE
-#define AP_PCRE_STACKBUF_SIZE (256)
+#ifndef POSIX_MALLOC_THRESHOLD
+#define POSIX_MALLOC_THRESHOLD (10)
+#endif
+
+#ifdef HAVE_PCRE2
+/* Reserve 128 bytes for the PCRE2 structs, that is (a bit above):
+ *   sizeof(pcre2_general_context) + offsetof(pcre2_match_data, ovector)
+ */
+#define AP_PCRE_STACKBUF_SIZE \
+    (128 + POSIX_MALLOC_THRESHOLD * sizeof(PCRE2_SIZE) * 2)
+#else
+#define AP_PCRE_STACKBUF_SIZE \
+    (POSIX_MALLOC_THRESHOLD * sizeof(int) * 3)
 #endif
 
 /* Table of error strings corresponding to POSIX error codes; must be
@@ -202,7 +213,7 @@
 {
 #ifdef HAVE_PCRE2
     uint32_t capcount;
-    size_t erroffset;
+    PCRE2_SIZE erroffset;
 #else
     const char *errorptr;
     int erroffset;
@@ -257,105 +268,201 @@
  *              Match a regular expression       *
  *************************************************/
 
-/* Unfortunately, PCRE requires 3 ints of working space for each captured
+/* Unfortunately, PCRE1 requires 3 ints of working space for each captured
  * substring, so we have to get and release working store instead of just using
  * the POSIX structures as was done in earlier releases when PCRE needed only 2
  * ints. However, if the number of possible capturing brackets is small, use a
  * block of store on the stack, to reduce the use of malloc/free. The threshold
- * is in a macro that can be changed at configure time.
- * Yet more unfortunately, PCRE2 wants an opaque context by providing the API
- * to allocate and free it, so to minimize these calls we maintain one opaque
- * context per thread (in Thread Local Storage, TLS) grown as needed, and while
- * at it we do the same for PCRE1 ints vectors. Note that this requires a fast
- * TLS mechanism to be worth it, which is the case of apr_thread_data_get/set()
- * from/to ap_thread_current() when AP_HAS_THREAD_LOCAL; otherwise we'll do
- * the allocation and freeing for each ap_regexec().
+ * is in POSIX_MALLOC_THRESHOLD macro that can be changed at configure time.
+ * PCRE2 takes an opaque match context and lets us provide the callbacks to
+ * manage the memory needed during the match, so we can still use a small stack
+ * space that'll suffice for regexes up to POSIX_MALLOC_THRESHOLD captures (and
+ * not too complex), above that either use some thread local subpool cache (if
+ * AP_HAS_THREAD_LOCAL) or fall back to malloc()/free().
  */
 
-#ifdef HAVE_PCRE2
-typedef pcre2_match_data* match_data_pt;
-typedef size_t*           match_vector_pt;
+#if AP_HAS_THREAD_LOCAL && !defined(APREG_NO_THREAD_LOCAL)
+#define APREG_USE_THREAD_LOCAL 1
 #else
-typedef int*              match_data_pt;
-typedef int*              match_vector_pt;
+#define APREG_USE_THREAD_LOCAL 0
 #endif
 
-struct match_data_state
-{
-    match_data_pt data;
-    char *buf;
-    apr_size_t buf_len;
+#ifdef HAVE_PCRE2
+typedef PCRE2_SIZE* match_vector_pt;
+#else
+typedef int*        match_vector_pt;
+#endif
+
+#if APREG_USE_THREAD_LOCAL
+struct match_thread_state {
+    char *heap;
+    apr_size_t heap_size;
+    apr_size_t heap_used;
+    apr_pool_t *pool;
+};
+
+AP_THREAD_LOCAL static struct match_thread_state *thread_state;
+#endif
+
+struct match_data_state {
+    /* keep first, struct aligned */
+    char buf[AP_PCRE_STACKBUF_SIZE];
     apr_size_t buf_used;
+
+#if APREG_USE_THREAD_LOCAL
+    apr_thread_t *thd;
+    struct match_thread_state *ts;
+#endif
+
 #ifdef HAVE_PCRE2
     pcre2_general_context *pcre2_ctx;
+    pcre2_match_data* match_data;
+#else
+    int *match_data;
 #endif
 };
 
 static void * private_malloc(size_t size, void *ctx)
 {
     struct match_data_state *state = ctx;
+    apr_size_t avail;
 
-    if(size <= (state->buf_len - state->buf_used)) {
+    avail = sizeof(state->buf) - state->buf_used;
+    if (avail >= size) {
         void *p = state->buf + state->buf_used;
-
-        state->buf_used += APR_ALIGN_DEFAULT(size);
-
+        size = APR_ALIGN_DEFAULT(size);
+        if (size > avail) {
+            size = avail;
+        }
+        state->buf_used += size;
         return p;
     }
-    else {
-        return malloc(size);
+
+#if APREG_USE_THREAD_LOCAL
+    if (state->thd) {
+        struct match_thread_state *ts = thread_state;
+        void *p;
+
+        if (!ts) {
+            apr_pool_t *tp = apr_thread_pool_get(state->thd);
+            ts = apr_pcalloc(tp, sizeof(*ts));
+            apr_pool_create(&ts->pool, tp);
+            thread_state = state->ts = ts;
+        }
+        else if (!state->ts) {
+            ts->heap_used = 0;
+            state->ts = ts;
+        }
+
+        avail = ts->heap_size - ts->heap_used;
+        if (avail >= size) {
+            size = APR_ALIGN_DEFAULT(size);
+            if (size > avail) {
+                size = avail;
+            }
+            p = ts->heap + ts->heap_used;
+        }
+        else {
+            ts->heap_size *= 2;
+            size = APR_ALIGN_DEFAULT(size);
+            if (ts->heap_size < size) {
+                ts->heap_size = size;
+            }
+            if (ts->heap_size < AP_PCRE_STACKBUF_SIZE * 2) {
+                ts->heap_size = AP_PCRE_STACKBUF_SIZE * 2;
+            }
+            ts->heap = apr_palloc(ts->pool, ts->heap_size);
+            ts->heap_used = 0;
+            p = ts->heap;
+        }
+
+        ts->heap_used += size;
+        return p;
     }
+#endif
+
+    return malloc(size);
 }
 
 static void private_free(void *block, void *ctx)
 {
     struct match_data_state *state = ctx;
-    void *buf_start = state->buf;
-    void *buf_end = state->buf + state->buf_len;
+    char *p = block;
 
-    if (block >= buf_start && block <= buf_end) {
+    if (p >= state->buf && p < state->buf + sizeof(state->buf)) {
         /* This block allocated from stack buffer. Do nothing. */
+        return;
     }
-    else {
-        free(block);
+
+#if APREG_USE_THREAD_LOCAL
+    if (state->thd) {
+        /* Freed in cleanup_state() eventually. */
+        return;
     }
+#endif
+
+    free(block);
 } 
 
 static APR_INLINE
-match_data_pt alloc_match_data(apr_size_t size,
-                               struct match_data_state *state,
-                               void *stack_buf,
-                               apr_size_t stack_buf_len)
+int setup_state(struct match_data_state *state, apr_uint32_t ncaps)
 {
-    state->buf = stack_buf;
-    state->buf_len = stack_buf_len;
     state->buf_used = 0;
-
-#ifdef HAVE_PCRE2
-    state->pcre2_ctx = pcre2_general_context_create(private_malloc, private_free, state);
-    if (!state->pcre2_ctx) { 
-        return NULL;
-    }
-    state->data = pcre2_match_data_create((int)size, state->pcre2_ctx);
-    if (!state->data) {
-        pcre2_general_context_free(state->pcre2_ctx);
-        return NULL;
-    }
-#else
-    state->data = private_malloc(size * sizeof(int) * 3, state);
+#if APREG_USE_THREAD_LOCAL
+    state->thd = ap_thread_current();
+    state->ts = NULL;
 #endif
 
-    return state->data;
+#ifdef HAVE_PCRE2
+    state->pcre2_ctx = pcre2_general_context_create(private_malloc,
+                                                    private_free, state);
+    if (!state->pcre2_ctx) { 
+        return 0;
+    }
+
+    state->match_data = pcre2_match_data_create(ncaps, state->pcre2_ctx);
+    if (!state->match_data) {
+        pcre2_general_context_free(state->pcre2_ctx);
+        return 0;
+    }
+#else
+    if (ncaps) {
+        state->match_data = private_malloc(ncaps * sizeof(int) * 3, state);
+        if (!state->match_data) {
+            return 0;
+        }
+    }
+    else {
+        /* Fine with PCRE1 */
+        state->match_data = NULL;
+    }
+#endif
+
+    return 1;
 }
 
 static APR_INLINE
-void free_match_data(struct match_data_state *state)
+void cleanup_state(struct match_data_state *state)
 {
 #ifdef HAVE_PCRE2
-    pcre2_match_data_free(state->data);
+    pcre2_match_data_free(state->match_data);
     pcre2_general_context_free(state->pcre2_ctx);
 #else
-    private_free(state->data, state);
+    if (state->match_data) {
+        private_free(state->match_data, state);
+    }
+#endif
+
+#if APREG_USE_THREAD_LOCAL
+    if (state->ts) {
+        /* Let the thread's pool allocator recycle or free according
+         * to its max_free setting.
+         */
+        struct match_thread_state *ts = state->ts;
+        apr_pool_clear(ts->pool);
+        ts->heap_size = 0;
+        ts->heap = NULL;
+    }
 #endif
 }
 
@@ -372,16 +479,12 @@
                                ap_regmatch_t *pmatch, int eflags)
 {
     int rc;
-    int options = 0, to_free = 0;
-    match_vector_pt ovector = NULL;
-    apr_size_t ncaps = (apr_size_t)preg->re_nsub + 1;
+    int options = 0;
     struct match_data_state state;
-    match_data_pt data;
-    /* Use apr_uint64_t to get proper alignment. */
-    apr_uint64_t stack_buf[(AP_PCRE_STACKBUF_SIZE + sizeof(apr_uint64_t) - 1) / sizeof(apr_uint64_t)];
+    match_vector_pt ovector = NULL;
+    apr_uint32_t ncaps = (apr_uint32_t)preg->re_nsub + 1;
 
-    data = alloc_match_data(ncaps, &state, stack_buf, sizeof(stack_buf));
-    if (!data) {
+    if (!setup_state(&state, ncaps)) {
         return AP_REG_ESPACE;
     }
 
@@ -396,11 +499,11 @@
 
 #ifdef HAVE_PCRE2
     rc = pcre2_match((const pcre2_code *)preg->re_pcre,
-                     (const unsigned char *)buff, len,
-                     0, options, data, NULL);
-    ovector = pcre2_get_ovector_pointer(data);
+                     (const unsigned char *)buff, len, 0, options,
+                     state.match_data, NULL);
+    ovector = pcre2_get_ovector_pointer(state.match_data);
 #else
-    ovector = data;
+    ovector = state.match_data;
     rc = pcre_exec((const pcre *)preg->re_pcre, NULL, buff, (int)len,
                    0, options, ovector, ncaps * 3);
 #endif
@@ -415,11 +518,11 @@
         }
         for (; i < nmatch; i++)
             pmatch[i].rm_so = pmatch[i].rm_eo = -1;
-        free_match_data(&state);
+        cleanup_state(&state);
         return 0;
     }
     else {
-        free_match_data(&state);
+        cleanup_state(&state);
 #ifdef HAVE_PCRE2
         if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21)
             return AP_REG_INVARG;