Bug#1000006: parser: depends on obsolete pcre3 library

To: 1000006@bugs.debian.org
Subject: Bug#1000006: parser: depends on obsolete pcre3 library
From: Boyuan Yang <byang@debian.org>
Date: Wed, 06 Dec 2023 14:40:05 -0500
Message-id: <[🔎] 01607b908dba7b2515ba70d2885a77402dd50057.camel@debian.org>
Reply-to: Boyuan Yang <byang@debian.org>, 1000006@bugs.debian.org
In-reply-to: <E1mnfuc-0000oB-9d@aragorn.weathertop.principate.org.uk>
References: <E1mnfuc-0000oB-9d@aragorn.weathertop.principate.org.uk> <E1mnfuc-0000oB-9d@aragorn.weathertop.principate.org.uk>

X-Debbugs-CC: yavor@gnu.org

On Thu, 18 Nov 2021 11:49:06 +0000 Matthew Vernon <matthew@debian.org> wrote:
> Source: parser
> Severity: important
> User: matthew-pcredep@debian.org
> Usertags: obsolete-pcre3
> 
> Dear maintainer,
> 
> Your package still depends on the old, obsolete PCRE3[0] libraries
> (i.e. libpcre3-dev). This has been end of life for a while now, and
> upstream do not intend to fix any further bugs in it. Accordingly, I
> would like to remove the pcre3 libraries from Debian, preferably in
> time for the release of Bookworm.
> 
> The newer PCRE2 library was first released in 2015, and has been in
> Debian since stretch. Upstream's documentation for PCRE2 is available
> here: https://pcre.org/current/doc/html/
> 
> Many large projects that use PCRE have made the switch now (e.g. git,
> php); it does involve some work, but we are now at the stage where
> PCRE3 should not be used, particularly if it might ever be exposed to
> untrusted input.
> 
> This mass bug filing was discussed on debian-devel@ in
> https://lists.debian.org/debian-devel/2021/11/msg00176.html
> 
> Regards,
> 
> Matthew
> 
> [0] Historical reasons mean that old PCRE is packaged as
> pcre3 in Debian

I am aware of the work at https://bugs.debian.org/1057281 , but
unfortunately I am unable to review the patch at the moment. In order
to prevent the loss of the proposed patch, I am including it as an email
attachment here.

Thanks,
Boyuan Yang

Description: Port to PCRE2.
Bug-Debian: https://bugs.debian.org/1000006
Author: Yavor Doganov <yavor@gnu.org>
Forwarded: mailto:mailbox@parser.ru
Last-Update: 2023-11-29
---

--- parser-3.4.6.orig/configure.ac
+++ parser-3.4.6/configure.ac
@@ -184,20 +184,20 @@
 	PCRE_INCLUDES="-I$PCRE/include"
-	PCRE_LIBS="$PCRE/lib/libpcre.la"
+	PCRE_LIBS="$PCRE/lib/libpcre2-8.la"
 
-	if test -f $PCRE/include/pcre.h -a -f $PCRE_LIBS; then
+	if test -f $PCRE/include/pcre2.h -a -f $PCRE_LIBS; then
 		PCRE_OK="yes"
 	else
-		PCRE_LIBS="-L$PCRE/lib -lpcre"
+		PCRE_LIBS="-L$PCRE/lib -lpcre2-8"
 	fi
 
 	if test "$PCRE" = "yes"; then
 		PCRE=""
-		PCRE_LIBS="-lpcre"
+		PCRE_LIBS="-lpcre2-8"
 		PCRE_INCLUDES=""
 		AC_MSG_WARN([--with-pcre value was not specified, hoping linker would find it])
 	fi
 ],[
-	PCRE_LIBS="-lpcre"
+	PCRE_LIBS="-lpcre2-8"
 	PCRE_INCLUDES=""
 	AC_MSG_WARN([--with-pcre was not specified, hoping linker would find it])
 ])
@@ -206,16 +206,21 @@
-	AC_MSG_CHECKING(for prce)
+	AC_MSG_CHECKING(for pcre2)
 	SAVE_LIBS=$LIBS
 	LIBS="$LIBS $PCRE_LIBS $PCRE_INCLUDES"
-	AC_TRY_LINK([ #include <pcre.h> ],[ const char *v=pcre_version(); ],
-		AC_MSG_RESULT(yes)
+	AC_LINK_IFELSE(
+          [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+                             #include <pcre2.h>]],
+             [[uint32_t ov=16;
+               pcre2_match_data *md;
+               md=pcre2_match_data_create(ov, NULL);]])],
+	  [AC_MSG_RESULT([yes])]
 	,
-		AC_MSG_RESULT(no)
+	  [AC_MSG_RESULT([no])
 		if test -z "$PCRE"; then
 			AC_MSG_ERROR(please specify path to PCRE: --with-pcre=DIR)
 		else
 			AC_MSG_ERROR($PCRE does not seem to be valid PCRE installation directory)
 		fi
-	)
+	])
 	LIBS=$SAVE_LIBS
 fi
 
--- parser-3.4.6.orig/src/include/pa_charset.h
+++ parser-3.4.6/src/include/pa_charset.h
@@ -16,7 +16,8 @@
 #include "pa_hash.h"
 #include "pa_array.h"
 
-#include "pcre.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 // we are using some pcre_internal.h stuff as well
 #include "../lib/pcre/pa_pcre_internal.h"
 
--- parser-3.4.6.orig/src/lib/pcre/pa_pcre_valid_utf8.c
+++ parser-3.4.6/src/lib/pcre/pa_pcre_valid_utf8.c
@@ -6,7 +6,8 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2012 University of Cambridge
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -38,112 +39,134 @@
 */
 
 
-/* This module contains an internal function for validating UTF-8 character
-strings. */
-
-#include "pcre.h"
+/* This module contains an internal function for validating UTF character
+strings. This file is also #included by the pcre2test program, which uses
+macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes
+with the library. In this case, PCRE2_PCRE2TEST is defined. */
+
+#define SUPPORT_UNICODE
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 #include "pa_pcre_internal.h"
 
+static const uint8_t utf8_table4[] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+
+#ifndef SUPPORT_UNICODE
+/*************************************************
+*  Dummy function when Unicode is not supported  *
+*************************************************/
+
+/* This function should never be called when Unicode is not supported. */
+
+int
+PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
+{
+(void)string;
+(void)length;
+(void)erroroffset;
+return 0;
+}
+#else  /* UTF is supported */
+
+
 
 /*************************************************
-*         Validate a UTF-8 string                *
+*           Validate a UTF string                *
 *************************************************/
 
 /* This function is called (optionally) at the start of compile or match, to
-check that a supposed UTF-8 string is actually valid. The early check means
+check that a supposed UTF string is actually valid. The early check means
 that subsequent code can assume it is dealing with a valid string. The check
 can be turned off for maximum performance, but the consequences of supplying an
 invalid string are then undefined.
 
-Originally, this function checked according to RFC 2279, allowing for values in
-the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
-the canonical format. Once somebody had pointed out RFC 3629 to me (it
-obsoletes 2279), additional restrictions were applied. The values are now
-limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
-subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
-characters is still checked.
-
-From release 8.13 more information about the details of the error are passed
-back in the returned value:
-
-PCRE_UTF8_ERR0   No error
-PCRE_UTF8_ERR1   Missing 1 byte at the end of the string
-PCRE_UTF8_ERR2   Missing 2 bytes at the end of the string
-PCRE_UTF8_ERR3   Missing 3 bytes at the end of the string
-PCRE_UTF8_ERR4   Missing 4 bytes at the end of the string
-PCRE_UTF8_ERR5   Missing 5 bytes at the end of the string
-PCRE_UTF8_ERR6   2nd-byte's two top bits are not 0x80
-PCRE_UTF8_ERR7   3rd-byte's two top bits are not 0x80
-PCRE_UTF8_ERR8   4th-byte's two top bits are not 0x80
-PCRE_UTF8_ERR9   5th-byte's two top bits are not 0x80
-PCRE_UTF8_ERR10  6th-byte's two top bits are not 0x80
-PCRE_UTF8_ERR11  5-byte character is not permitted by RFC 3629
-PCRE_UTF8_ERR12  6-byte character is not permitted by RFC 3629
-PCRE_UTF8_ERR13  4-byte character with value > 0x10ffff is not permitted
-PCRE_UTF8_ERR14  3-byte character with value 0xd000-0xdfff is not permitted
-PCRE_UTF8_ERR15  Overlong 2-byte sequence
-PCRE_UTF8_ERR16  Overlong 3-byte sequence
-PCRE_UTF8_ERR17  Overlong 4-byte sequence
-PCRE_UTF8_ERR18  Overlong 5-byte sequence (won't ever occur)
-PCRE_UTF8_ERR19  Overlong 6-byte sequence (won't ever occur)
-PCRE_UTF8_ERR20  Isolated 0x80 byte (not within UTF-8 character)
-PCRE_UTF8_ERR21  Byte with the illegal value 0xfe or 0xff
-
 Arguments:
   string       points to the string
-  length       length of string, or -1 if the string is zero-terminated
+  length       length of string
   errp         pointer to an error position offset variable
 
-Returns:       = 0    if the string is a valid UTF-8 string
-               > 0    otherwise, setting the offset of the bad character
+Returns:       == 0    if the string is a valid UTF string
+               != 0    otherwise, setting the offset of the bad character
 */
 
-typedef unsigned char *PCRE_PUCHAR;
-#define SUPPORT_UTF
+int
+pa_pcre_valid_utf(PCRE2_SPTR string, int length, int *erroroffset)
+{
+PCRE2_SPTR p;
+uint32_t c;
 
-static const unsigned char utf8_table4[] = {
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+/* ----------------- Check a UTF-8 string ----------------- */
 
+#if PCRE2_CODE_UNIT_WIDTH == 8
 
-int
-pa_pcre_valid_utf(PCRE_PUCHAR string, int length, int *erroroffset)
-{
-#ifdef SUPPORT_UTF
-register PCRE_PUCHAR p;
+/* Originally, this function checked according to RFC 2279, allowing for values
+in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
+in the canonical format. Once somebody had pointed out RFC 3629 to me (it
+obsoletes 2279), additional restrictions were applied. The values are now
+limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
+subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
+characters is still checked. Error returns are as follows:
 
-if (length < 0)
-  {
-  for (p = string; *p != 0; p++);
-  length = (int)(p - string);
-  }
+PCRE2_ERROR_UTF8_ERR1   Missing 1 byte at the end of the string
+PCRE2_ERROR_UTF8_ERR2   Missing 2 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR3   Missing 3 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR4   Missing 4 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR5   Missing 5 bytes at the end of the string
+PCRE2_ERROR_UTF8_ERR6   2nd-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR7   3rd-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR8   4th-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR9   5th-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR10  6th-byte's two top bits are not 0x80
+PCRE2_ERROR_UTF8_ERR11  5-byte character is not permitted by RFC 3629
+PCRE2_ERROR_UTF8_ERR12  6-byte character is not permitted by RFC 3629
+PCRE2_ERROR_UTF8_ERR13  4-byte character with value > 0x10ffff is not permitted
+PCRE2_ERROR_UTF8_ERR14  3-byte character with value 0xd800-0xdfff is not permitted
+PCRE2_ERROR_UTF8_ERR15  Overlong 2-byte sequence
+PCRE2_ERROR_UTF8_ERR16  Overlong 3-byte sequence
+PCRE2_ERROR_UTF8_ERR17  Overlong 4-byte sequence
+PCRE2_ERROR_UTF8_ERR18  Overlong 5-byte sequence (won't ever occur)
+PCRE2_ERROR_UTF8_ERR19  Overlong 6-byte sequence (won't ever occur)
+PCRE2_ERROR_UTF8_ERR20  Isolated 0x80 byte (not within UTF-8 character)
+PCRE2_ERROR_UTF8_ERR21  Byte with the illegal value 0xfe or 0xff
+*/
 
-for (p = string; length-- > 0; p++)
+for (p = string; length > 0; p++)
   {
-  register int ab, c, d;
+  uint32_t ab, d;
 
   c = *p;
+  length--;
+
   if (c < 128) continue;                /* ASCII character */
 
   if (c < 0xc0)                         /* Isolated 10xx xxxx byte */
     {
-    *erroroffset = (int)(p - string);
-    return PCRE_UTF8_ERR20;
+    *erroroffset = p - string;
+    return PCRE2_ERROR_UTF8_ERR20;
     }
 
   if (c >= 0xfe)                        /* Invalid 0xfe or 0xff bytes */
     {
-    *erroroffset = (int)(p - string);
-    return PCRE_UTF8_ERR21;
+    *erroroffset = p - string;
+    return PCRE2_ERROR_UTF8_ERR21;
     }
 
-  ab = utf8_table4[c & 0x3f];     /* Number of additional bytes */
-  if (length < ab)
+  ab = utf8_table4[c & 0x3f];           /* Number of additional bytes (1-5) */
+  if (length < ab)                      /* Missing bytes */
     {
-    *erroroffset = (int)(p - string);          /* Missing bytes */
-    return ab - length;                 /* Codes ERR1 to ERR5 */
+    *erroroffset = p - string;
+    switch(ab - length)
+      {
+      case 1: return PCRE2_ERROR_UTF8_ERR1;
+      case 2: return PCRE2_ERROR_UTF8_ERR2;
+      case 3: return PCRE2_ERROR_UTF8_ERR3;
+      case 4: return PCRE2_ERROR_UTF8_ERR4;
+      case 5: return PCRE2_ERROR_UTF8_ERR5;
+      }
     }
   length -= ab;                         /* Length remaining */
 
@@ -152,7 +175,7 @@
   if (((d = *(++p)) & 0xc0) != 0x80)
     {
     *erroroffset = (int)(p - string) - 1;
-    return PCRE_UTF8_ERR6;
+    return PCRE2_ERROR_UTF8_ERR6;
     }
 
   /* For each length, check that the remaining bytes start with the 0x80 bit
@@ -167,7 +190,7 @@
     case 1: if ((c & 0x3e) == 0)
       {
       *erroroffset = (int)(p - string) - 1;
-      return PCRE_UTF8_ERR15;
+      return PCRE2_ERROR_UTF8_ERR15;
       }
     break;
 
@@ -179,17 +202,17 @@
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
       *erroroffset = (int)(p - string) - 2;
-      return PCRE_UTF8_ERR7;
+      return PCRE2_ERROR_UTF8_ERR7;
       }
     if (c == 0xe0 && (d & 0x20) == 0)
       {
       *erroroffset = (int)(p - string) - 2;
-      return PCRE_UTF8_ERR16;
+      return PCRE2_ERROR_UTF8_ERR16;
       }
     if (c == 0xed && d >= 0xa0)
       {
       *erroroffset = (int)(p - string) - 2;
-      return PCRE_UTF8_ERR14;
+      return PCRE2_ERROR_UTF8_ERR14;
       }
     break;
 
@@ -201,22 +224,22 @@
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
       *erroroffset = (int)(p - string) - 2;
-      return PCRE_UTF8_ERR7;
+      return PCRE2_ERROR_UTF8_ERR7;
       }
     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
       {
       *erroroffset = (int)(p - string) - 3;
-      return PCRE_UTF8_ERR8;
+      return PCRE2_ERROR_UTF8_ERR8;
       }
     if (c == 0xf0 && (d & 0x30) == 0)
       {
       *erroroffset = (int)(p - string) - 3;
-      return PCRE_UTF8_ERR17;
+      return PCRE2_ERROR_UTF8_ERR17;
       }
     if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
       {
       *erroroffset = (int)(p - string) - 3;
-      return PCRE_UTF8_ERR13;
+      return PCRE2_ERROR_UTF8_ERR13;
       }
     break;
 
@@ -232,22 +255,22 @@
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
       *erroroffset = (int)(p - string) - 2;
-      return PCRE_UTF8_ERR7;
+      return PCRE2_ERROR_UTF8_ERR7;
       }
     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
       {
       *erroroffset = (int)(p - string) - 3;
-      return PCRE_UTF8_ERR8;
+      return PCRE2_ERROR_UTF8_ERR8;
       }
     if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
       {
       *erroroffset = (int)(p - string) - 4;
-      return PCRE_UTF8_ERR9;
+      return PCRE2_ERROR_UTF8_ERR9;
       }
     if (c == 0xf8 && (d & 0x38) == 0)
       {
       *erroroffset = (int)(p - string) - 4;
-      return PCRE_UTF8_ERR18;
+      return PCRE2_ERROR_UTF8_ERR18;
       }
     break;
 
@@ -258,27 +281,27 @@
     if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
       {
       *erroroffset = (int)(p - string) - 2;
-      return PCRE_UTF8_ERR7;
+      return PCRE2_ERROR_UTF8_ERR7;
       }
     if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
       {
       *erroroffset = (int)(p - string) - 3;
-      return PCRE_UTF8_ERR8;
+      return PCRE2_ERROR_UTF8_ERR8;
       }
     if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
       {
       *erroroffset = (int)(p - string) - 4;
-      return PCRE_UTF8_ERR9;
+      return PCRE2_ERROR_UTF8_ERR9;
       }
     if ((*(++p) & 0xc0) != 0x80)     /* Sixth byte */
       {
       *erroroffset = (int)(p - string) - 5;
-      return PCRE_UTF8_ERR10;
+      return PCRE2_ERROR_UTF8_ERR10;
       }
     if (c == 0xfc && (d & 0x3c) == 0)
       {
       *erroroffset = (int)(p - string) - 5;
-      return PCRE_UTF8_ERR19;
+      return PCRE2_ERROR_UTF8_ERR19;
       }
     break;
     }
@@ -290,16 +313,89 @@
   if (ab > 3)
     {
     *erroroffset = (int)(p - string) - ab;
-    return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
+    return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
+    }
+  }
+return 0;
+
+
+/* ----------------- Check a UTF-16 string ----------------- */
+
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+
+/* There's not so much work, nor so many errors, for UTF-16.
+PCRE2_ERROR_UTF16_ERR1  Missing low surrogate at the end of the string
+PCRE2_ERROR_UTF16_ERR2  Invalid low surrogate
+PCRE2_ERROR_UTF16_ERR3  Isolated low surrogate
+*/
+
+for (p = string; length > 0; p++)
+  {
+  c = *p;
+  length--;
+
+  if ((c & 0xf800) != 0xd800)
+    {
+    /* Normal UTF-16 code point. Neither high nor low surrogate. */
+    }
+  else if ((c & 0x0400) == 0)
+    {
+    /* High surrogate. Must be followed by a low surrogate. */
+    if (length == 0)
+      {
+      *erroroffset = p - string;
+      return PCRE2_ERROR_UTF16_ERR1;
+      }
+    p++;
+    length--;
+    if ((*p & 0xfc00) != 0xdc00)
+      {
+      *erroroffset = p - string - 1;
+      return PCRE2_ERROR_UTF16_ERR2;
+      }
+    }
+  else
+    {
+    /* Isolated low surrogate. Always an error. */
+    *erroroffset = p - string;
+    return PCRE2_ERROR_UTF16_ERR3;
     }
   }
+return 0;
+
 
-#else  /* SUPPORT_UTF */
-(void)(string);  /* Keep picky compilers happy */
-(void)(length);
-#endif
 
-return PCRE_UTF8_ERR0;   /* This indicates success */
+/* ----------------- Check a UTF-32 string ----------------- */
+
+#else
+
+/* There is very little to do for a UTF-32 string.
+PCRE2_ERROR_UTF32_ERR1  Surrogate character
+PCRE2_ERROR_UTF32_ERR2  Character > 0x10ffff
+*/
+
+for (p = string; length > 0; length--, p++)
+  {
+  c = *p;
+  if ((c & 0xfffff800u) != 0xd800u)
+    {
+    /* Normal UTF-32 code point. Neither high nor low surrogate. */
+    if (c > 0x10ffffu)
+      {
+      *erroroffset = p - string;
+      return PCRE2_ERROR_UTF32_ERR2;
+      }
+    }
+  else
+    {
+    /* A surrogate */
+    *erroroffset = p - string;
+    return PCRE2_ERROR_UTF32_ERR1;
+    }
+  }
+return 0;
+#endif  /* CODE_UNIT_WIDTH */
 }
+#endif  /* SUPPORT_UNICODE */
 
-/* End of pcre_valid_utf8.c */
+/* End of pcre2_valid_utf.c */
--- parser-3.4.6.orig/src/main/pa_common.C
+++ parser-3.4.6/src/main/pa_common.C
@@ -12,7 +12,8 @@
 #include "pa_charsets.h"
 #include "pa_http.h"
 #include "pa_request_charsets.h"
-#include "pcre.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 #include "pa_request.h"
 
 #include "pa_idna.h"
--- parser-3.4.6.orig/src/main/pa_globals.C
+++ parser-3.4.6/src/main/pa_globals.C
@@ -26,7 +26,6 @@
 #include "pa_cache_managers.h"
 
 #include "ltdl.h"
-#include "pcre.h"
 
 volatile const char * IDENT_PA_GLOBALS_C="$Id: pa_globals.C,v 1.212 2021/01/16 15:47:05 moko Exp $" IDENT_PA_GLOBALS_H IDENT_PA_SAPI_H;
 
@@ -206,10 +205,6 @@
 
 #endif
 
-	// pcre
-	pcre_malloc=pa_malloc;
-	pcre_free=pa_free;
-
 	// cord
 	CORD_oom_fn=pa_CORD_oom_fn;
 }
--- parser-3.4.6.orig/src/types/pa_vregex.h
+++ parser-3.4.6/src/types/pa_vregex.h
@@ -16,7 +16,8 @@
 #include "pa_common.h"
 #include "pa_vstateless_object.h"
 #include "pa_charset.h"
-#include "pcre.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 
 // defines
 
@@ -62,26 +63,33 @@
 		fpattern(0),
 		foptions_cstr(0),
 		fcode(0),
-		fextra(0),
-		fstudied(false)
+		fgen_ctxt(0),
+		fcmp_ctxt(0),
+		fmatch_ctxt(0),
+		fmatch_data(0)
 	{
 		foptions[0]=0;
 		foptions[1]=0;
 	}
 
-	VRegex(Charset& acharset, const String* aregex, const String* aoptions):
-		fextra(0),
-		fstudied(false)
+	// all PCRE2 handles must start out null: compile()/exec() create them lazily when null, ~VRegex() frees them
+	VRegex(Charset& acharset, const String* aregex, const String* aoptions):
+		fcode(0),
+		fgen_ctxt(0),
+		fcmp_ctxt(0),
+		fmatch_ctxt(0),
+		fmatch_data(0)
 	{
 		set(acharset, aregex, aoptions);
 		compile();
 	}
 
 	~VRegex(){
-		if(fextra)
-			pcre_free(fextra);
-		if(fcode)
-			pcre_free(fcode);
+		pcre2_code_free(fcode);
+		pcre2_match_data_free(fmatch_data);
+		pcre2_match_context_free(fmatch_ctxt);
+		pcre2_compile_context_free(fcmp_ctxt);
+		pcre2_general_context_free(fgen_ctxt);
 	}
 
 	void set(Charset& acharset, const String* aregex, const String* aoptions);
@@ -89,18 +97,16 @@
 
 	void compile();
 
-	void study();
-
-	int exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart=0);
+	int exec(const char* string, size_t string_len, int prestart=0);
 
 	// size_t info();
 
+	size_t* get_ovector_ptr();
+
 	size_t full_info(int type);
 
 	size_t get_info_size();
 
-	size_t get_study_size();
-
 	size_t get_options();
 
 	bool is_pre_post_match_needed(){
@@ -120,13 +126,15 @@
 
 private:
 	Charset* fcharset;
-	const char* fpattern;
+	PCRE2_SPTR fpattern;
 	const char* foptions_cstr;
 	int foptions[2];
 
-	pcre* fcode;
-	pcre_extra* fextra;
-	bool fstudied;
+	pcre2_code* fcode;
+	pcre2_general_context* fgen_ctxt;
+	pcre2_compile_context* fcmp_ctxt;
+	pcre2_match_context* fmatch_ctxt;
+	pcre2_match_data* fmatch_data;
 };
 
 
--- parser-3.4.6.orig/src/classes/file.C
+++ parser-3.4.6/src/classes/file.C
@@ -711,7 +711,6 @@
 				} else if(vfilter->is_string()) {
 					if(!vfilter->get_string()->trim().is_empty()) {
 						vregex=new VRegex(r.charsets.source(), &vfilter->as_string(), 0/*options*/);
-						vregex->study();
 						vrcleaner.vregex=vregex;
 					}
 				} else {
@@ -726,14 +725,11 @@
 	Table::Action_options table_options;
 	Table& table=*new Table(file_list_table_template, table_options);
 
-	const int ovector_size=(1/*match*/)*3;
-	int ovector[ovector_size];
-
 	LOAD_DIR(absolute_path_cstr, 
 		const char* file_name_cstr=ffblk.name();
 		size_t file_name_size=strlen(file_name_cstr);
 
-		if(!vregex || vregex->exec(file_name_cstr, file_name_size, ovector, ovector_size)>=0) {
+		if(!vregex || vregex->exec(file_name_cstr, file_name_size)>=0) {
 			Table::element_type row(new ArrayString);
 			*row+=new String(pa_strdup(file_name_cstr, file_name_size), String::L_TAINTED);
 			*row+=new String(String::Body::Format(ffblk.is_dir(stat) ? 1 : 0), String::L_CLEAN);
--- parser-3.4.6.orig/src/main/pa_string.C
+++ parser-3.4.6/src/main/pa_string.C
@@ -652,8 +652,7 @@
 
 	const char* subject=cstr();
 	size_t subject_length=length();
-	const int ovector_size=(1/*match*/+MAX_MATCH_GROUPS)*3;
-	int ovector[ovector_size];
+	size_t* ovector;
 
 	Table::Action_options table_options;
 	Table& table=*new Table(string_match_table_template, table_options);
@@ -662,11 +661,12 @@
 	int poststart=0;
 	int postfinish=length();
 	while(true) {
-		int exec_result=vregex->exec(subject, subject_length, ovector, ovector_size, prestart);
+		int exec_result=vregex->exec(subject, subject_length, prestart);
 
 		if(exec_result<0) // only PCRE_ERROR_NOMATCH might be here, other negative results cause an exception
 			break;
 
+		ovector=vregex->get_ovector_ptr();
 		int prefinish=ovector[0];
 		poststart=ovector[1];
 
--- parser-3.4.6.orig/src/types/pa_vregex.C
+++ parser-3.4.6/src/types/pa_vregex.C
@@ -19,8 +19,28 @@
 
 const char* get_pcre_exec_error_text(int exec_result){
 	switch(exec_result){
-		case PCRE_ERROR_BADUTF8:
-		case PCRE_ERROR_BADUTF8_OFFSET:
+		case PCRE2_ERROR_UTF8_ERR1:
+		case PCRE2_ERROR_UTF8_ERR2:
+		case PCRE2_ERROR_UTF8_ERR3:
+		case PCRE2_ERROR_UTF8_ERR4:
+		case PCRE2_ERROR_UTF8_ERR5:
+		case PCRE2_ERROR_UTF8_ERR6:
+		case PCRE2_ERROR_UTF8_ERR7:
+		case PCRE2_ERROR_UTF8_ERR8:
+		case PCRE2_ERROR_UTF8_ERR9:
+		case PCRE2_ERROR_UTF8_ERR10:
+		case PCRE2_ERROR_UTF8_ERR11:
+		case PCRE2_ERROR_UTF8_ERR12:
+		case PCRE2_ERROR_UTF8_ERR13:
+		case PCRE2_ERROR_UTF8_ERR14:
+		case PCRE2_ERROR_UTF8_ERR15:
+		case PCRE2_ERROR_UTF8_ERR16:
+		case PCRE2_ERROR_UTF8_ERR17:
+		case PCRE2_ERROR_UTF8_ERR18:
+		case PCRE2_ERROR_UTF8_ERR19:
+		case PCRE2_ERROR_UTF8_ERR20:
+		case PCRE2_ERROR_UTF8_ERR21:
+		case PCRE2_ERROR_BADUTFOFFSET:
 			return "UTF-8 validation failed during pcre_exec (%d).";
 			break;
 		default:
@@ -28,6 +48,15 @@
 	}
 }
 
+static void*
+pa_pcre_malloc(size_t size, void *ptr){
+	return pa_malloc(size);
+}
+
+static void
+pa_pcre_free(void *ptr, void *tag){
+	pa_free(ptr);
+}
 
 Value& VRegex::as_expr_result() {
 	return *new VInt(as_int());
@@ -41,19 +70,18 @@
 		int set;
 		int *result;
 	} regex_option[]={
-		{"i", "I", 0, PCRE_CASELESS, result}, // a=A
-		{"s", "S", 0, PCRE_DOTALL, result}, // ^\n\n$ [default]
-		{"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
-		{"x", 0, 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
-		{"U", 0, 0, PCRE_UNGREEDY, result}, // ungreedy patterns (greedy by default)
+		{"i", "I", 0, PCRE2_CASELESS, result}, // a=A
+		{"s", "S", 0, PCRE2_DOTALL, result}, // ^\n\n$ [default]
+		{"m", "M", PCRE2_DOTALL, PCRE2_MULTILINE, result}, // ^aaa\n$^bbb\n$
+		{"x", 0, 0, PCRE2_EXTENDED, result}, // whitespace in regex ignored
+		{"U", 0, 0, PCRE2_UNGREEDY, result}, // ungreedy patterns (greedy by default)
 		{"g", "G", 0, MF_GLOBAL_SEARCH, result+1}, // many rows
 		{"'", 0, 0, MF_NEED_PRE_POST_MATCH, result+1},
 		{"n", 0, 0, MF_JUST_COUNT_MATCHES, result+1},
 		{0, 0, 0, 0, 0}
 	};
-	result[0]=PCRE_EXTRA /* backslash+non-special char causes error */
-			| PCRE_DOTALL /* dot matches all chars including newline char */
-			| PCRE_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */;
+	result[0]=PCRE2_DOTALL /* dot matches all chars including newline char */
+			| PCRE2_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */;
 	result[1]=0;
 
 	if(options && !options->is_empty()){
@@ -79,7 +107,7 @@
 
 	fcharset=&acharset;
 
-	fpattern=aregex->untaint_cstr(String::L_REGEX);
+	fpattern=reinterpret_cast<const unsigned char*>(aregex->untaint_cstr(String::L_REGEX));
 
 	foptions_cstr=aoptions ? aoptions->cstr() : 0;
 
@@ -99,34 +127,49 @@
 
 
 void VRegex::compile(){
-	const char* err_ptr;
-	int err_offset;
+	int err;
+	size_t err_offset;
 	int options=foptions[0];
 
 	// @todo (for UTF-8): check string & pattern and use PCRE_NO_UTF8_CHECK option 
 	if(fcharset->isUTF8())
-		options |= (PCRE_UTF8 | PCRE_UCP);
+		options |= (PCRE2_UTF | PCRE2_UCP);
+
+	if(!fgen_ctxt)
+		fgen_ctxt=pcre2_general_context_create(pa_pcre_malloc, pa_pcre_free, NULL);
+
+	if(!fcmp_ctxt)
+		fcmp_ctxt=pcre2_compile_context_create(fgen_ctxt);
 
-	fcode=pcre_compile(fpattern, options,
-		&err_ptr, &err_offset,
-		fcharset->pcre_tables);
+	pcre2_set_character_tables(fcmp_ctxt, fcharset->pcre_tables);
+	fcode=pcre2_compile(fpattern, PCRE2_ZERO_TERMINATED, options,
+		&err, &err_offset,
+		fcmp_ctxt);
 
 	if(!fcode){
+		PCRE2_UCHAR buffer[120];
+
+		pcre2_get_error_message(err, buffer, sizeof(buffer));
 		throw Exception(PCRE_EXCEPTION_TYPE,
-			new String(fpattern+err_offset, String::L_TAINTED),
-			"regular expression syntax error - %s", err_ptr);
+			new String(reinterpret_cast<const char*>(fpattern+err_offset), String::L_TAINTED),
+			"regular expression syntax error - %s", buffer);
 	}
 
 }
 
 
+size_t* VRegex::get_ovector_ptr(){
+	return pcre2_get_ovector_pointer(fmatch_data);
+}
+
+
 size_t VRegex::full_info(int type){
 	size_t result;
-	int fullinfo_result=pcre_fullinfo(fcode, fextra, type, &result);
+	int fullinfo_result=pcre2_pattern_info(fcode, type, &result);
 	if(fullinfo_result<0){
 		throw Exception(PCRE_EXCEPTION_TYPE,
-			new String(fpattern, String::L_TAINTED),
-			"pcre_full_info error (%d)", fullinfo_result);
+			new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED),
+			"pcre2_pattern_info error (%d)", fullinfo_result);
 	}
 
 	return result;
@@ -134,39 +177,24 @@
 
 
 size_t VRegex::get_info_size(){
-	return full_info(PCRE_INFO_SIZE);
+	return full_info(PCRE2_INFO_SIZE);
 }
 
 
-size_t VRegex::get_study_size(){
-	return full_info(PCRE_INFO_STUDYSIZE);
-}
-
-void VRegex::study(){
-	if(fstudied)
-		return;
-
-	const char* err_ptr;
-	fextra=pcre_study(fcode, 0/*options*/, &err_ptr);
-
-	if(err_ptr){
-		throw Exception(PCRE_EXCEPTION_TYPE,
-			new String(fpattern, String::L_TAINTED),
-			"pcre_study error: %s", err_ptr);
-	}
-
-	fstudied=true;
-}
+int VRegex::exec(const char* string, size_t string_len, int prestart){
+	if(!fmatch_data)
+		fmatch_data=pcre2_match_data_create_from_pattern(fcode, fgen_ctxt);
 
+	if(!fmatch_ctxt)
+		fmatch_ctxt=pcre2_match_context_create(fgen_ctxt);
 
-int VRegex::exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart){
-	int result=pcre_exec(fcode, fextra, 
-		string, string_len, prestart,
-		prestart>0 ? PCRE_NO_UTF8_CHECK : 0, ovector, ovector_size);
+	int result=pcre2_match(fcode,
+		reinterpret_cast<const unsigned char*>(string), string_len, prestart,
+		prestart>0 ? PCRE2_NO_UTF_CHECK : 0, fmatch_data, fmatch_ctxt);
 			
-	if(result<0 && result!=PCRE_ERROR_NOMATCH){
+	if(result<0 && result!=PCRE2_ERROR_NOMATCH){
 		throw Exception(PCRE_EXCEPTION_TYPE, 
-			new String(fpattern, String::L_TAINTED),
+			new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED),
 			get_pcre_exec_error_text(result), result);
 	}
 
@@ -176,7 +204,7 @@
 
 Value* VRegex::get_element(const String& aname) { 
 	if(aname == REGEX_PATTERN_NAME)
-		return new VString(*new String(fpattern, String::L_TAINTED));
+		return new VString(*new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED));
 
 	if(aname == REGEX_OPTIONS_NAME)
 		return new VString(*new String(foptions_cstr, String::L_TAINTED));
--- parser-3.4.6.orig/src/types/pa_vmail.C
+++ parser-3.4.6/src/types/pa_vmail.C
@@ -484,7 +484,8 @@
 size_t mail_header_utf8_substring(const char *mail, size_t sub_length, size_t length){
 	int error_offset;
 	if(int error_code=pa_pcre_valid_utf((unsigned char *)mail, sub_length, &error_offset)){
-		if(error_code<PCRE_UTF8_ERR6){ // Missing X byte at the end of the string errors
-			sub_length+=error_code; // adding X bytes
+		// PCRE2 error codes are negative (UTF8_ERR1..ERR5 are -3..-7, UTF8_ERR6 is -8), unlike PCRE1's 1..5
+		if(error_code>PCRE2_ERROR_UTF8_ERR6){ // Missing X byte at the end of the string errors
+			sub_length+=PCRE2_ERROR_UTF8_ERR1-error_code+1; // adding X bytes
 			return sub_length < length ? sub_length : length;
 		}
--- parser-3.4.6.orig/src/lib/pcre/pa_pcre_internal.h
+++ parser-3.4.6/src/lib/pcre/pa_pcre_internal.h
@@ -46,8 +46,8 @@
 for this function is in the pcre_valid_utf8.c module. */
 
 #ifdef __cplusplus
-    extern "C" int pa_pcre_valid_utf(unsigned char *string, int length, int *erroroffset);
+    extern "C" int pa_pcre_valid_utf(const unsigned char *string, int length, int *erroroffset);
 #else
-    extern int pa_pcre_valid_utf(unsigned char *string, int length, int *erroroffset);
+    extern int pa_pcre_valid_utf(const unsigned char *string, int length, int *erroroffset);
 #endif
 
--- parser-3.4.6.orig/src/classes/regex.C
+++ parser-3.4.6/src/classes/regex.C
@@ -40,7 +40,6 @@
 	}
 
 	vregex.compile();
-	vregex.study();
 }
 
 
@@ -49,11 +48,6 @@
 	r.write(*new VInt(vregex.get_info_size()));
 }
 
-static void _study_size(Request& r, MethodParams&) {
-	VRegex& vregex=GET_SELF(r, VRegex);
-	r.write(*new VInt(vregex.get_study_size()));
-}
-
 // constructor
 
 MRegex::MRegex(): Methoded("regex") {
@@ -63,8 +57,5 @@
 	// ^regex.info_size[]
 	add_native_method("size", Method::CT_DYNAMIC, _size, 0, 0);
 
-	// ^regex.study_size[]
-	add_native_method("study_size", Method::CT_DYNAMIC, _study_size, 0, 0);
-
 }
 
--- parser-3.4.6.orig/src/classes/string.C
+++ parser-3.4.6/src/classes/string.C
@@ -223,7 +223,6 @@
 static void split_list(Value& delim_value, const String& string, ArrayString& result) {
 	if(Value* value=delim_value.as(VREGEX_TYPE)){
 		VRegex *vregex=static_cast<VRegex*>(value);
-		vregex->study();
 
 		int matches_count=0;
 		Split_action_info ai = { string, result };
@@ -396,7 +395,6 @@
 		vregex=static_cast<VRegex*>(value);
 	} else {
 		vregex=new VRegex(r.charsets.source(), &regexp.as_string(), (options) ? (&options->as_string()) : 0);
-		vregex->study();
 		vrcleaner.vregex=vregex;
 	}

Attachment: signature.asc
Description: This is a digitally signed message part

Reply to:

Prev by Date: python-qrencode_1.2-6_source.changes ACCEPTED into unstable
Next by Date: Bug#1024286: lcm: reproducible-builds: Embedded build path and usrmerge paths in Makefile
Previous by thread: python-qrencode_1.2-6_source.changes ACCEPTED into unstable
Next by thread: Bug#1024286: lcm: reproducible-builds: Embedded build path and usrmerge paths in Makefile
Index(es):
- Date
- Thread