X-Debbugs-CC: yavor@gnu.org On Thu, 18 Nov 2021 11:49:06 +0000 Matthew Vernon <matthew@debian.org> wrote: > Source: parser > Severity: important > User: matthew-pcredep@debian.org > Usertags: obsolete-pcre3 > > Dear maintainer, > > Your package still depends on the old, obsolete PCRE3[0] libraries > (i.e. libpcre3-dev). This has been end of life for a while now, and > upstream do not intend to fix any further bugs in it. Accordingly, I > would like to remove the pcre3 libraries from Debian, preferably in > time for the release of Bookworm. > > The newer PCRE2 library was first released in 2015, and has been in > Debian since stretch. Upstream's documentation for PCRE2 is available > here: https://pcre.org/current/doc/html/ > > Many large projects that use PCRE have made the switch now (e.g. git, > php); it does involve some work, but we are now at the stage where > PCRE3 should not be used, particularly if it might ever be exposed to > untrusted input. > > This mass bug filing was discussed on debian-devel@ in > https://lists.debian.org/debian-devel/2021/11/msg00176.html > > Regards, > > Matthew [0] Historical reasons mean that old PCRE is packaged as > pcre3 in Debian I am aware of the work at https://bugs.debian.org/1057281 , but unfortunately I am unable to review the patch at the moment. In order to prevent the loss of the proposed patch, I am including it as an email attachment here. Thanks, Boyuan Yang
Description: Port to PCRE2.
Bug-Debian: https://bugs.debian.org/1000006
Author: Yavor Doganov <yavor@gnu.org>
Forwarded: mailto:mailbox@parser.ru
Last-Update: 2023-11-29
---
--- parser-3.4.6.orig/configure.ac
+++ parser-3.4.6/configure.ac
@@ -184,20 +184,20 @@
PCRE_INCLUDES="-I$PCRE/include"
PCRE_LIBS="$PCRE/lib/libpcre.la"
- if test -f $PCRE/include/pcre.h -a -f $PCRE_LIBS; then
+ if test -f $PCRE/include/pcre2.h -a -f $PCRE_LIBS; then
PCRE_OK="yes"
else
- PCRE_LIBS="-L$PCRE/lib -lpcre"
+ PCRE_LIBS="-L$PCRE/lib -lpcre2-8"
fi
if test "$PCRE" = "yes"; then
PCRE=""
- PCRE_LIBS="-lpcre"
+ PCRE_LIBS="-lpcre2-8"
PCRE_INCLUDES=""
AC_MSG_WARN([--with-pcre value was not specified, hoping linker would find it])
fi
],[
- PCRE_LIBS="-lpcre"
+ PCRE_LIBS="-lpcre2-8"
PCRE_INCLUDES=""
AC_MSG_WARN([--with-pcre was not specified, hoping linker would find it])
])
@@ -206,16 +206,21 @@
AC_MSG_CHECKING(for prce)
SAVE_LIBS=$LIBS
LIBS="$LIBS $PCRE_LIBS $PCRE_INCLUDES"
- AC_TRY_LINK([ #include <pcre.h> ],[ const char *v=pcre_version(); ],
- AC_MSG_RESULT(yes)
+ AC_LINK_IFELSE(
+ [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+ #include <pcre2.h>]],
+ [[uint32_t ov=16;
+ pcre2_match_data *md;
+ md=pcre2_match_data_create(ov, NULL);]])],
+ [AC_MSG_RESULT([yes])]
,
- AC_MSG_RESULT(no)
+ [AC_MSG_RESULT([no])
if test -z "$PCRE"; then
AC_MSG_ERROR(please specify path to PCRE: --with-pcre=DIR)
else
AC_MSG_ERROR($PCRE does not seem to be valid PCRE installation directory)
fi
- )
+ ])
LIBS=$SAVE_LIBS
fi
--- parser-3.4.6.orig/src/include/pa_charset.h
+++ parser-3.4.6/src/include/pa_charset.h
@@ -16,7 +16,8 @@
#include "pa_hash.h"
#include "pa_array.h"
-#include "pcre.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
// we are using some pcre_internal.h stuff as well
#include "../lib/pcre/pa_pcre_internal.h"
--- parser-3.4.6.orig/src/lib/pcre/pa_pcre_valid_utf8.c
+++ parser-3.4.6/src/lib/pcre/pa_pcre_valid_utf8.c
@@ -6,7 +6,8 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2012 University of Cambridge
+ Original API code Copyright (c) 1997-2012 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -38,112 +39,134 @@
*/
-/* This module contains an internal function for validating UTF-8 character
-strings. */
-
-#include "pcre.h"
+/* This module contains an internal function for validating UTF character
+strings. This file is also #included by the pcre2test program, which uses
+macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes
+with the library. In this case, PCRE2_PCRE2TEST is defined. */
+
+#define SUPPORT_UNICODE
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
#include "pa_pcre_internal.h"
+static const uint8_t utf8_table4[] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+
+#ifndef SUPPORT_UNICODE
+/*************************************************
+* Dummy function when Unicode is not supported *
+*************************************************/
+
+/* This function should never be called when Unicode is not supported. */
+
+int
+PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
+{
+(void)string;
+(void)length;
+(void)erroroffset;
+return 0;
+}
+#else /* UTF is supported */
+
+
/*************************************************
-* Validate a UTF-8 string *
+* Validate a UTF string *
*************************************************/
/* This function is called (optionally) at the start of compile or match, to
-check that a supposed UTF-8 string is actually valid. The early check means
+check that a supposed UTF string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying an
invalid string are then undefined.
-Originally, this function checked according to RFC 2279, allowing for values in
-the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
-the canonical format. Once somebody had pointed out RFC 3629 to me (it
-obsoletes 2279), additional restrictions were applied. The values are now
-limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
-subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
-characters is still checked.
-
-From release 8.13 more information about the details of the error are passed
-back in the returned value:
-
-PCRE_UTF8_ERR0 No error
-PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
-PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string
-PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string
-PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string
-PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string
-PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80
-PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80
-PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80
-PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80
-PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80
-PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629
-PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629
-PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
-PCRE_UTF8_ERR14 3-byte character with value 0xd000-0xdfff is not permitted
-PCRE_UTF8_ERR15 Overlong 2-byte sequence
-PCRE_UTF8_ERR16 Overlong 3-byte sequence
-PCRE_UTF8_ERR17 Overlong 4-byte sequence
-PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
-PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
-PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
-PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
-
Arguments:
string points to the string
- length length of string, or -1 if the string is zero-terminated
+ length length of string
errp pointer to an error position offset variable
-Returns: = 0 if the string is a valid UTF-8 string
- > 0 otherwise, setting the offset of the bad character
+Returns: == 0 if the string is a valid UTF string
+ != 0 otherwise, setting the offset of the bad character
*/
-typedef unsigned char *PCRE_PUCHAR;
-#define SUPPORT_UTF
+int
+pa_pcre_valid_utf(PCRE2_SPTR string, int length, int *erroroffset)
+{
+PCRE2_SPTR p;
+uint32_t c;
-static const unsigned char utf8_table4[] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+/* ----------------- Check a UTF-8 string ----------------- */
+#if PCRE2_CODE_UNIT_WIDTH == 8
-int
-pa_pcre_valid_utf(PCRE_PUCHAR string, int length, int *erroroffset)
-{
-#ifdef SUPPORT_UTF
-register PCRE_PUCHAR p;
+/* Originally, this function checked according to RFC 2279, allowing for values
+in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
+in the canonical format. Once somebody had pointed out RFC 3629 to me (it
+obsoletes 2279), additional restrictions were applied. The values are now
+limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
+subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
+characters is still checked. Error returns are as follows:
-if (length < 0)
- {
- for (p = string; *p != 0; p++);
- length = (int)(p - string);
- }
+PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string
+PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629
+PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629
+PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted
+PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted
+PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence
+PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence
+PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence
+PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur)
+PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur)
+PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character)
+PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff
+*/
-for (p = string; length-- > 0; p++)
+for (p = string; length > 0; p++)
{
- register int ab, c, d;
+ uint32_t ab, d;
c = *p;
+ length--;
+
if (c < 128) continue; /* ASCII character */
if (c < 0xc0) /* Isolated 10xx xxxx byte */
{
- *erroroffset = (int)(p - string);
- return PCRE_UTF8_ERR20;
+ *erroroffset = p - string;
+ return PCRE2_ERROR_UTF8_ERR20;
}
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
{
- *erroroffset = (int)(p - string);
- return PCRE_UTF8_ERR21;
+ *erroroffset = p - string;
+ return PCRE2_ERROR_UTF8_ERR21;
}
- ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
- if (length < ab)
+ ab = utf8_table4[c & 0x3f]; /* Number of additional bytes (1-5) */
+ if (length < ab) /* Missing bytes */
{
- *erroroffset = (int)(p - string); /* Missing bytes */
- return ab - length; /* Codes ERR1 to ERR5 */
+ *erroroffset = p - string;
+ switch(ab - length)
+ {
+ case 1: return PCRE2_ERROR_UTF8_ERR1;
+ case 2: return PCRE2_ERROR_UTF8_ERR2;
+ case 3: return PCRE2_ERROR_UTF8_ERR3;
+ case 4: return PCRE2_ERROR_UTF8_ERR4;
+ case 5: return PCRE2_ERROR_UTF8_ERR5;
+ }
}
length -= ab; /* Length remaining */
@@ -152,7 +175,7 @@
if (((d = *(++p)) & 0xc0) != 0x80)
{
*erroroffset = (int)(p - string) - 1;
- return PCRE_UTF8_ERR6;
+ return PCRE2_ERROR_UTF8_ERR6;
}
/* For each length, check that the remaining bytes start with the 0x80 bit
@@ -167,7 +190,7 @@
case 1: if ((c & 0x3e) == 0)
{
*erroroffset = (int)(p - string) - 1;
- return PCRE_UTF8_ERR15;
+ return PCRE2_ERROR_UTF8_ERR15;
}
break;
@@ -179,17 +202,17 @@
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
- return PCRE_UTF8_ERR7;
+ return PCRE2_ERROR_UTF8_ERR7;
}
if (c == 0xe0 && (d & 0x20) == 0)
{
*erroroffset = (int)(p - string) - 2;
- return PCRE_UTF8_ERR16;
+ return PCRE2_ERROR_UTF8_ERR16;
}
if (c == 0xed && d >= 0xa0)
{
*erroroffset = (int)(p - string) - 2;
- return PCRE_UTF8_ERR14;
+ return PCRE2_ERROR_UTF8_ERR14;
}
break;
@@ -201,22 +224,22 @@
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
- return PCRE_UTF8_ERR7;
+ return PCRE2_ERROR_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
- return PCRE_UTF8_ERR8;
+ return PCRE2_ERROR_UTF8_ERR8;
}
if (c == 0xf0 && (d & 0x30) == 0)
{
*erroroffset = (int)(p - string) - 3;
- return PCRE_UTF8_ERR17;
+ return PCRE2_ERROR_UTF8_ERR17;
}
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
{
*erroroffset = (int)(p - string) - 3;
- return PCRE_UTF8_ERR13;
+ return PCRE2_ERROR_UTF8_ERR13;
}
break;
@@ -232,22 +255,22 @@
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
- return PCRE_UTF8_ERR7;
+ return PCRE2_ERROR_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
- return PCRE_UTF8_ERR8;
+ return PCRE2_ERROR_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
- return PCRE_UTF8_ERR9;
+ return PCRE2_ERROR_UTF8_ERR9;
}
if (c == 0xf8 && (d & 0x38) == 0)
{
*erroroffset = (int)(p - string) - 4;
- return PCRE_UTF8_ERR18;
+ return PCRE2_ERROR_UTF8_ERR18;
}
break;
@@ -258,27 +281,27 @@
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
- return PCRE_UTF8_ERR7;
+ return PCRE2_ERROR_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
- return PCRE_UTF8_ERR8;
+ return PCRE2_ERROR_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
- return PCRE_UTF8_ERR9;
+ return PCRE2_ERROR_UTF8_ERR9;
}
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
{
*erroroffset = (int)(p - string) - 5;
- return PCRE_UTF8_ERR10;
+ return PCRE2_ERROR_UTF8_ERR10;
}
if (c == 0xfc && (d & 0x3c) == 0)
{
*erroroffset = (int)(p - string) - 5;
- return PCRE_UTF8_ERR19;
+ return PCRE2_ERROR_UTF8_ERR19;
}
break;
}
@@ -290,16 +313,89 @@
if (ab > 3)
{
*erroroffset = (int)(p - string) - ab;
- return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
+ return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
+ }
+ }
+return 0;
+
+
+/* ----------------- Check a UTF-16 string ----------------- */
+
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+
+/* There's not so much work, nor so many errors, for UTF-16.
+PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string
+PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate
+PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
+*/
+
+for (p = string; length > 0; p++)
+ {
+ c = *p;
+ length--;
+
+ if ((c & 0xf800) != 0xd800)
+ {
+ /* Normal UTF-16 code point. Neither high nor low surrogate. */
+ }
+ else if ((c & 0x0400) == 0)
+ {
+ /* High surrogate. Must be followed by a low surrogate. */
+ if (length == 0)
+ {
+ *erroroffset = p - string;
+ return PCRE2_ERROR_UTF16_ERR1;
+ }
+ p++;
+ length--;
+ if ((*p & 0xfc00) != 0xdc00)
+ {
+ *erroroffset = p - string - 1;
+ return PCRE2_ERROR_UTF16_ERR2;
+ }
+ }
+ else
+ {
+ /* Isolated low surrogate. Always an error. */
+ *erroroffset = p - string;
+ return PCRE2_ERROR_UTF16_ERR3;
}
}
+return 0;
+
-#else /* SUPPORT_UTF */
-(void)(string); /* Keep picky compilers happy */
-(void)(length);
-#endif
-return PCRE_UTF8_ERR0; /* This indicates success */
+/* ----------------- Check a UTF-32 string ----------------- */
+
+#else
+
+/* There is very little to do for a UTF-32 string.
+PCRE2_ERROR_UTF32_ERR1 Surrogate character
+PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff
+*/
+
+for (p = string; length > 0; length--, p++)
+ {
+ c = *p;
+ if ((c & 0xfffff800u) != 0xd800u)
+ {
+ /* Normal UTF-32 code point. Neither high nor low surrogate. */
+ if (c > 0x10ffffu)
+ {
+ *erroroffset = p - string;
+ return PCRE2_ERROR_UTF32_ERR2;
+ }
+ }
+ else
+ {
+ /* A surrogate */
+ *erroroffset = p - string;
+ return PCRE2_ERROR_UTF32_ERR1;
+ }
+ }
+return 0;
+#endif /* CODE_UNIT_WIDTH */
}
+#endif /* SUPPORT_UNICODE */
-/* End of pcre_valid_utf8.c */
+/* End of pcre2_valid_utf.c */
--- parser-3.4.6.orig/src/main/pa_common.C
+++ parser-3.4.6/src/main/pa_common.C
@@ -12,7 +12,8 @@
#include "pa_charsets.h"
#include "pa_http.h"
#include "pa_request_charsets.h"
-#include "pcre.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
#include "pa_request.h"
#include "pa_idna.h"
--- parser-3.4.6.orig/src/main/pa_globals.C
+++ parser-3.4.6/src/main/pa_globals.C
@@ -26,7 +26,6 @@
#include "pa_cache_managers.h"
#include "ltdl.h"
-#include "pcre.h"
volatile const char * IDENT_PA_GLOBALS_C="$Id: pa_globals.C,v 1.212 2021/01/16 15:47:05 moko Exp $" IDENT_PA_GLOBALS_H IDENT_PA_SAPI_H;
@@ -206,10 +205,6 @@
#endif
- // pcre
- pcre_malloc=pa_malloc;
- pcre_free=pa_free;
-
// cord
CORD_oom_fn=pa_CORD_oom_fn;
}
--- parser-3.4.6.orig/src/types/pa_vregex.h
+++ parser-3.4.6/src/types/pa_vregex.h
@@ -16,7 +16,8 @@
#include "pa_common.h"
#include "pa_vstateless_object.h"
#include "pa_charset.h"
-#include "pcre.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
// defines
@@ -62,26 +63,27 @@
fpattern(0),
foptions_cstr(0),
fcode(0),
- fextra(0),
- fstudied(false)
+ fgen_ctxt(0),
+ fcmp_ctxt(0),
+ fmatch_ctxt(0),
+ fmatch_data(0)
{
foptions[0]=0;
foptions[1]=0;
}
- VRegex(Charset& acharset, const String* aregex, const String* aoptions):
- fextra(0),
- fstudied(false)
+ VRegex(Charset& acharset, const String* aregex, const String* aoptions): fcode(0), fgen_ctxt(0), fcmp_ctxt(0), fmatch_ctxt(0), fmatch_data(0) // zeroed because compile()/exec() test these for null before lazily creating them and ~VRegex() frees them
{
set(acharset, aregex, aoptions);
compile();
}
~VRegex(){
- if(fextra)
- pcre_free(fextra);
- if(fcode)
- pcre_free(fcode);
+ pcre2_code_free(fcode);
+ pcre2_match_data_free(fmatch_data);
+ pcre2_match_context_free(fmatch_ctxt);
+ pcre2_compile_context_free(fcmp_ctxt);
+ pcre2_general_context_free(fgen_ctxt);
}
void set(Charset& acharset, const String* aregex, const String* aoptions);
@@ -89,18 +91,16 @@
void compile();
- void study();
-
- int exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart=0);
+ int exec(const char* string, size_t string_len, int prestart=0);
// size_t info();
+ size_t* get_ovector_ptr();
+
size_t full_info(int type);
size_t get_info_size();
- size_t get_study_size();
-
size_t get_options();
bool is_pre_post_match_needed(){
@@ -120,13 +120,15 @@
private:
Charset* fcharset;
- const char* fpattern;
+ PCRE2_SPTR fpattern;
const char* foptions_cstr;
int foptions[2];
- pcre* fcode;
- pcre_extra* fextra;
- bool fstudied;
+ pcre2_code* fcode;
+ pcre2_general_context* fgen_ctxt;
+ pcre2_compile_context* fcmp_ctxt;
+ pcre2_match_context* fmatch_ctxt;
+ pcre2_match_data* fmatch_data;
};
--- parser-3.4.6.orig/src/classes/file.C
+++ parser-3.4.6/src/classes/file.C
@@ -711,7 +711,6 @@
} else if(vfilter->is_string()) {
if(!vfilter->get_string()->trim().is_empty()) {
vregex=new VRegex(r.charsets.source(), &vfilter->as_string(), 0/*options*/);
- vregex->study();
vrcleaner.vregex=vregex;
}
} else {
@@ -726,14 +725,11 @@
Table::Action_options table_options;
Table& table=*new Table(file_list_table_template, table_options);
- const int ovector_size=(1/*match*/)*3;
- int ovector[ovector_size];
-
LOAD_DIR(absolute_path_cstr,
const char* file_name_cstr=ffblk.name();
size_t file_name_size=strlen(file_name_cstr);
- if(!vregex || vregex->exec(file_name_cstr, file_name_size, ovector, ovector_size)>=0) {
+ if(!vregex || vregex->exec(file_name_cstr, file_name_size)>=0) {
Table::element_type row(new ArrayString);
*row+=new String(pa_strdup(file_name_cstr, file_name_size), String::L_TAINTED);
*row+=new String(String::Body::Format(ffblk.is_dir(stat) ? 1 : 0), String::L_CLEAN);
--- parser-3.4.6.orig/src/main/pa_string.C
+++ parser-3.4.6/src/main/pa_string.C
@@ -652,8 +652,7 @@
const char* subject=cstr();
size_t subject_length=length();
- const int ovector_size=(1/*match*/+MAX_MATCH_GROUPS)*3;
- int ovector[ovector_size];
+ size_t* ovector;
Table::Action_options table_options;
Table& table=*new Table(string_match_table_template, table_options);
@@ -662,11 +661,12 @@
int poststart=0;
int postfinish=length();
while(true) {
- int exec_result=vregex->exec(subject, subject_length, ovector, ovector_size, prestart);
+ int exec_result=vregex->exec(subject, subject_length, prestart);
if(exec_result<0) // only PCRE_ERROR_NOMATCH might be here, other negative results cause an exception
break;
+ ovector=vregex->get_ovector_ptr();
int prefinish=ovector[0];
poststart=ovector[1];
--- parser-3.4.6.orig/src/types/pa_vregex.C
+++ parser-3.4.6/src/types/pa_vregex.C
@@ -19,8 +19,28 @@
const char* get_pcre_exec_error_text(int exec_result){
switch(exec_result){
- case PCRE_ERROR_BADUTF8:
- case PCRE_ERROR_BADUTF8_OFFSET:
+ case PCRE2_ERROR_UTF8_ERR1:
+ case PCRE2_ERROR_UTF8_ERR2:
+ case PCRE2_ERROR_UTF8_ERR3:
+ case PCRE2_ERROR_UTF8_ERR4:
+ case PCRE2_ERROR_UTF8_ERR5:
+ case PCRE2_ERROR_UTF8_ERR6:
+ case PCRE2_ERROR_UTF8_ERR7:
+ case PCRE2_ERROR_UTF8_ERR8:
+ case PCRE2_ERROR_UTF8_ERR9:
+ case PCRE2_ERROR_UTF8_ERR10:
+ case PCRE2_ERROR_UTF8_ERR11:
+ case PCRE2_ERROR_UTF8_ERR12:
+ case PCRE2_ERROR_UTF8_ERR13:
+ case PCRE2_ERROR_UTF8_ERR14:
+ case PCRE2_ERROR_UTF8_ERR15:
+ case PCRE2_ERROR_UTF8_ERR16:
+ case PCRE2_ERROR_UTF8_ERR17:
+ case PCRE2_ERROR_UTF8_ERR18:
+ case PCRE2_ERROR_UTF8_ERR19:
+ case PCRE2_ERROR_UTF8_ERR20:
+ case PCRE2_ERROR_UTF8_ERR21:
+ case PCRE2_ERROR_BADUTFOFFSET:
return "UTF-8 validation failed during pcre_exec (%d).";
break;
default:
@@ -28,6 +48,15 @@
}
}
+static void*
+pa_pcre_malloc(size_t size, void *ptr){
+ return pa_malloc(size);
+}
+
+static void
+pa_pcre_free(void *ptr, void *tag){
+ pa_free(ptr);
+}
Value& VRegex::as_expr_result() {
return *new VInt(as_int());
@@ -41,19 +70,18 @@
int set;
int *result;
} regex_option[]={
- {"i", "I", 0, PCRE_CASELESS, result}, // a=A
- {"s", "S", 0, PCRE_DOTALL, result}, // ^\n\n$ [default]
- {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
- {"x", 0, 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
- {"U", 0, 0, PCRE_UNGREEDY, result}, // ungreedy patterns (greedy by default)
+ {"i", "I", 0, PCRE2_CASELESS, result}, // a=A
+ {"s", "S", 0, PCRE2_DOTALL, result}, // ^\n\n$ [default]
+ {"m", "M", PCRE2_DOTALL, PCRE2_MULTILINE, result}, // ^aaa\n$^bbb\n$
+ {"x", 0, 0, PCRE2_EXTENDED, result}, // whitespace in regex ignored
+ {"U", 0, 0, PCRE2_UNGREEDY, result}, // ungreedy patterns (greedy by default)
{"g", "G", 0, MF_GLOBAL_SEARCH, result+1}, // many rows
{"'", 0, 0, MF_NEED_PRE_POST_MATCH, result+1},
{"n", 0, 0, MF_JUST_COUNT_MATCHES, result+1},
{0, 0, 0, 0, 0}
};
- result[0]=PCRE_EXTRA /* backslash+non-special char causes error */
- | PCRE_DOTALL /* dot matches all chars including newline char */
- | PCRE_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */;
+ result[0]=PCRE2_DOTALL /* dot matches all chars including newline char */
+ | PCRE2_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */;
result[1]=0;
if(options && !options->is_empty()){
@@ -79,7 +107,7 @@
fcharset=&acharset;
- fpattern=aregex->untaint_cstr(String::L_REGEX);
+ fpattern=reinterpret_cast<const unsigned char*>(aregex->untaint_cstr(String::L_REGEX));
foptions_cstr=aoptions ? aoptions->cstr() : 0;
@@ -99,34 +127,49 @@
void VRegex::compile(){
- const char* err_ptr;
- int err_offset;
+ int err;
+ size_t err_offset;
int options=foptions[0];
// @todo (for UTF-8): check string & pattern and use PCRE_NO_UTF8_CHECK option
if(fcharset->isUTF8())
- options |= (PCRE_UTF8 | PCRE_UCP);
+ options |= (PCRE2_UTF | PCRE2_UCP);
+
+ if(!fgen_ctxt)
+ fgen_ctxt=pcre2_general_context_create(pa_pcre_malloc, pa_pcre_free, NULL);
+
+ if(!fcmp_ctxt)
+ fcmp_ctxt=pcre2_compile_context_create(fgen_ctxt);
- fcode=pcre_compile(fpattern, options,
- &err_ptr, &err_offset,
- fcharset->pcre_tables);
+ pcre2_set_character_tables(fcmp_ctxt, fcharset->pcre_tables);
+ fcode=pcre2_compile(fpattern, PCRE2_ZERO_TERMINATED, options,
+ &err, &err_offset,
+ fcmp_ctxt);
if(!fcode){
+ PCRE2_UCHAR buffer[120];
+
+ pcre2_get_error_message(err, buffer, sizeof(buffer));
throw Exception(PCRE_EXCEPTION_TYPE,
- new String(fpattern+err_offset, String::L_TAINTED),
- "regular expression syntax error - %s", err_ptr);
+ new String(reinterpret_cast<const char*>(fpattern+err_offset), String::L_TAINTED),
+ "regular expression syntax error - %s", buffer);
}
}
+size_t* VRegex::get_ovector_ptr(){
+ return pcre2_get_ovector_pointer(fmatch_data);
+}
+
+
size_t VRegex::full_info(int type){
size_t result;
- int fullinfo_result=pcre_fullinfo(fcode, fextra, type, &result);
+ int fullinfo_result=pcre2_pattern_info(fcode, type, &result);
if(fullinfo_result<0){
throw Exception(PCRE_EXCEPTION_TYPE,
- new String(fpattern, String::L_TAINTED),
- "pcre_full_info error (%d)", fullinfo_result);
+ new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED),
+ "pcre2_pattern_info error (%d)", fullinfo_result);
}
return result;
@@ -134,39 +177,24 @@
size_t VRegex::get_info_size(){
- return full_info(PCRE_INFO_SIZE);
+ return full_info(PCRE2_INFO_SIZE);
}
-size_t VRegex::get_study_size(){
- return full_info(PCRE_INFO_STUDYSIZE);
-}
-
-void VRegex::study(){
- if(fstudied)
- return;
-
- const char* err_ptr;
- fextra=pcre_study(fcode, 0/*options*/, &err_ptr);
-
- if(err_ptr){
- throw Exception(PCRE_EXCEPTION_TYPE,
- new String(fpattern, String::L_TAINTED),
- "pcre_study error: %s", err_ptr);
- }
-
- fstudied=true;
-}
+int VRegex::exec(const char* string, size_t string_len, int prestart){
+ if(!fmatch_data)
+ fmatch_data=pcre2_match_data_create_from_pattern(fcode, fgen_ctxt);
+ if(!fmatch_ctxt)
+ fmatch_ctxt=pcre2_match_context_create(fgen_ctxt);
-int VRegex::exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart){
- int result=pcre_exec(fcode, fextra,
- string, string_len, prestart,
- prestart>0 ? PCRE_NO_UTF8_CHECK : 0, ovector, ovector_size);
+ int result=pcre2_match(fcode,
+ reinterpret_cast<const unsigned char*>(string), string_len, prestart,
+ prestart>0 ? PCRE2_NO_UTF_CHECK : 0, fmatch_data, fmatch_ctxt);
- if(result<0 && result!=PCRE_ERROR_NOMATCH){
+ if(result<0 && result!=PCRE2_ERROR_NOMATCH){
throw Exception(PCRE_EXCEPTION_TYPE,
- new String(fpattern, String::L_TAINTED),
+ new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED),
get_pcre_exec_error_text(result), result);
}
@@ -176,7 +204,7 @@
Value* VRegex::get_element(const String& aname) {
if(aname == REGEX_PATTERN_NAME)
- return new VString(*new String(fpattern, String::L_TAINTED));
+ return new VString(*new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED));
if(aname == REGEX_OPTIONS_NAME)
return new VString(*new String(foptions_cstr, String::L_TAINTED));
--- parser-3.4.6.orig/src/types/pa_vmail.C
+++ parser-3.4.6/src/types/pa_vmail.C
@@ -484,7 +484,7 @@
size_t mail_header_utf8_substring(const char *mail, size_t sub_length, size_t length){
int error_offset;
if(int error_code=pa_pcre_valid_utf((unsigned char *)mail, sub_length, &error_offset)){
- if(error_code<PCRE_UTF8_ERR6){ // Missing X byte at the end of the string errors
- sub_length+=error_code; // adding X bytes
+ if(error_code>PCRE2_ERROR_UTF8_ERR6){ // Missing X byte at the end of the string errors (ERR1..ERR5); PCRE2 codes are negative, so these compare greater than ERR6
+ sub_length+=PCRE2_ERROR_UTF8_ERR1-error_code+1; // adding X bytes (X is 1 for ERR1 ... 5 for ERR5)
return sub_length < length ? sub_length : length;
}
--- parser-3.4.6.orig/src/lib/pcre/pa_pcre_internal.h
+++ parser-3.4.6/src/lib/pcre/pa_pcre_internal.h
@@ -46,8 +46,8 @@
for this function is in the pcre_valid_utf8.c module. */
#ifdef __cplusplus
- extern "C" int pa_pcre_valid_utf(unsigned char *string, int length, int *erroroffset);
+ extern "C" int pa_pcre_valid_utf(const unsigned char *string, int length, int *erroroffset);
#else
- extern int pa_pcre_valid_utf(unsigned char *string, int length, int *erroroffset);
+ extern int pa_pcre_valid_utf(const unsigned char *string, int length, int *erroroffset);
#endif
--- parser-3.4.6.orig/src/classes/regex.C
+++ parser-3.4.6/src/classes/regex.C
@@ -40,7 +40,6 @@
}
vregex.compile();
- vregex.study();
}
@@ -49,11 +48,6 @@
r.write(*new VInt(vregex.get_info_size()));
}
-static void _study_size(Request& r, MethodParams&) {
- VRegex& vregex=GET_SELF(r, VRegex);
- r.write(*new VInt(vregex.get_study_size()));
-}
-
// constructor
MRegex::MRegex(): Methoded("regex") {
@@ -63,8 +57,5 @@
// ^regex.info_size[]
add_native_method("size", Method::CT_DYNAMIC, _size, 0, 0);
- // ^regex.study_size[]
- add_native_method("study_size", Method::CT_DYNAMIC, _study_size, 0, 0);
-
}
--- parser-3.4.6.orig/src/classes/string.C
+++ parser-3.4.6/src/classes/string.C
@@ -223,7 +223,6 @@
static void split_list(Value& delim_value, const String& string, ArrayString& result) {
if(Value* value=delim_value.as(VREGEX_TYPE)){
VRegex *vregex=static_cast<VRegex*>(value);
- vregex->study();
int matches_count=0;
Split_action_info ai = { string, result };
@@ -396,7 +395,6 @@
vregex=static_cast<VRegex*>(value);
} else {
vregex=new VRegex(r.charsets.source(), ®exp.as_string(), (options) ? (&options->as_string()) : 0);
- vregex->study();
vrcleaner.vregex=vregex;
}
Attachment:
signature.asc
Description: This is a digitally signed message part