sys-devel/libperl/files/libperl-5.8.8-CVE-2008-1927.patch - third_party/overlays/portage-stable - Git at Google

 Fix a double free / segfault with utf8 regexps
 Debian #454792
 [rt.cpan.org #48156]
 [rt.cpan.org #40641]
 upstream change 29204

 UTF8_ALLOW_DEFAULT definition in utf8.h picked from upstream change 27688

 diff --git a/embed.fnc b/embed.fnc
 index edfbc0e..26524c7 100644
 --- a/embed.fnc
 +++ b/embed.fnc
 @@ -1168,6 +1168,7 @@ Es	|void	|reguni		|NN const struct RExC_state_t *state|UV uv|NN char *s|NN STRLE
  Es	|regnode*|regclass	|NN struct RExC_state_t *state
  ERs	|I32	|regcurly	|NN const char *
  Es	|regnode*|reg_node	|NN struct RExC_state_t *state|U8 op
 +Es	|UV	|reg_recode	|const char value|NULLOK SV **encp
  Es	|regnode*|regpiece	|NN struct RExC_state_t *state|NN I32 *flagp
  Es	|void	|reginsert	|NN struct RExC_state_t *state|U8 op|NN regnode *opnd
  Es	|void	|regoptail	|NN struct RExC_state_t *state|NN regnode *p|NN regnode *val
 diff --git a/embed.h b/embed.h
 index 2b38fd5..372b04f 100644
 --- a/embed.h
 +++ b/embed.h
 @@ -1234,6 +1234,7 @@
  #define regclass		S_regclass
  #define regcurly		S_regcurly
  #define reg_node		S_reg_node
 +#define reg_recode		S_reg_recode
  #define regpiece		S_regpiece
  #define reginsert		S_reginsert
  #define regoptail		S_regoptail
 @@ -3277,6 +3278,7 @@
  #define regclass(a)		S_regclass(aTHX_ a)
  #define regcurly(a)		S_regcurly(aTHX_ a)
  #define reg_node(a,b)		S_reg_node(aTHX_ a,b)
 +#define reg_recode(a,b)		S_reg_recode(aTHX_ a,b)
  #define regpiece(a,b)		S_regpiece(aTHX_ a,b)
  #define reginsert(a,b,c)	S_reginsert(aTHX_ a,b,c)
  #define regoptail(a,b,c)	S_regoptail(aTHX_ a,b,c)
 diff --git a/pod/perldiag.pod b/pod/perldiag.pod
 index 9b3134c..7d95216 100644
 --- a/pod/perldiag.pod
 +++ b/pod/perldiag.pod
 @@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler.  See L<attributes>.
  (W printf) Perl does not understand the given format conversion.  See
  L<perlfunc/sprintf>.

 +=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/
 +
 +(W regexp) The numeric escape (for example C<\xHH>) of value < 256
 +didn't correspond to a single character through the conversion
 +from the encoding specified by the encoding pragma.
 +The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead.
 +The <-- HERE shows in the regular expression about where the
 +escape was discovered.
 +
  =item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/

  (F) The range specified in a character class had a minimum character
 diff --git a/proto.h b/proto.h
 index 6d185dd..ef6c0cf 100644
 --- a/proto.h
 +++ b/proto.h
 @@ -1748,6 +1748,7 @@ STATIC I32	S_regcurly(pTHX_ const char *)
  			__attribute__warn_unused_result__;

  STATIC regnode*	S_reg_node(pTHX_ struct RExC_state_t *state, U8 op);
 +STATIC UV	S_reg_recode(pTHX_ const char value, SV **encp);
  STATIC regnode*	S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp);
  STATIC void	S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd);
  STATIC void	S_regoptail(pTHX_ struct RExC_state_t *state, regnode *p, regnode *val);
 diff --git a/regcomp.c b/regcomp.c
 index 928cf39..98d48dd 100644
 --- a/regcomp.c
 +++ b/regcomp.c
 @@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
  }

  /*
 + * reg_recode
 + *
 + * It returns the code point in utf8 for the value in *encp.
 + *    value: a code value in the source encoding
 + *    encp:  a pointer to an Encode object
 + *
 + * If the result from Encode is not a single character,
 + * it returns U+FFFD (Replacement character) and sets *encp to NULL.
 + */
 +STATIC UV
 +S_reg_recode(pTHX_ const char value, SV **encp)
 +{
 +    STRLEN numlen = 1;
 +    SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
 +    const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
 +					 : SvPVX(sv);
 +    const STRLEN newlen = SvCUR(sv);
 +    UV uv = UNICODE_REPLACEMENT;
 +
 +    if (newlen)
 +	uv = SvUTF8(sv)
 +	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
 +	     : *(U8*)s;
 +
 +    if (!newlen || numlen != newlen) {
 +	uv = UNICODE_REPLACEMENT;
 +	if (encp)
 +	    *encp = NULL;
 +    }
 +    return uv;
 +}
 +
 +/*
   - regatom - the lowest level
   *
   * Optimization:  gobbles an entire sequence of ordinary characters so that
 @@ -3182,6 +3215,8 @@ tryagain:
  			    ender = grok_hex(p, &numlen, &flags, NULL);
  			    p += numlen;
  			}
 +			if (PL_encoding && ender < 0x100)
 +			    goto recode_encoding;
  			break;
  		    case 'c':
  			p++;
 @@ -3201,6 +3236,17 @@ tryagain:
  			    --p;
  			    goto loopdone;
  			}
 +			if (PL_encoding && ender < 0x100)
 +			    goto recode_encoding;
 +			break;
 +		    recode_encoding:
 +			{
 +			    SV* enc = PL_encoding;
 +			    ender = reg_recode((const char)(U8)ender, &enc);
 +			    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
 +				vWARN(p, "Invalid escape in the specified encoding");
 +			    RExC_utf8 = 1;
 +			}
  			break;
  		    case '\0':
  			if (p >= RExC_end)
 @@ -3331,32 +3377,6 @@ tryagain:
  	break;
      }

 -    /* If the encoding pragma is in effect recode the text of
 -     * any EXACT-kind nodes. */
 -    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
 -	STRLEN oldlen = STR_LEN(ret);
 -	SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
 -
 -	if (RExC_utf8)
 -	    SvUTF8_on(sv);
 -	if (sv_utf8_downgrade(sv, TRUE)) {
 -	    const char * const s = sv_recode_to_utf8(sv, PL_encoding);
 -	    const STRLEN newlen = SvCUR(sv);
 -
 -	    if (SvUTF8(sv))
 -		RExC_utf8 = 1;
 -	    if (!SIZE_ONLY) {
 -		DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
 -				      (int)oldlen, STRING(ret),
 -				      (int)newlen, s));
 -		Copy(s, STRING(ret), newlen, char);
 -		STR_LEN(ret) += newlen - oldlen;
 -		RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
 -	    } else
 -		RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
 -	}
 -    }
 -
      return(ret);
  }

 @@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
  		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
  		    RExC_parse += numlen;
  		}
 +		if (PL_encoding && value < 0x100)
 +		    goto recode_encoding;
  		break;
  	    case 'c':
  		value = UCHARAT(RExC_parse++);
 @@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
  		break;
  	    case '0': case '1': case '2': case '3': case '4':
  	    case '5': case '6': case '7': case '8': case '9':
 -            {
 -                I32 flags = 0;
 -		numlen = 3;
 -		value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
 -		RExC_parse += numlen;
 -		break;
 -            }
 +		{
 +		    I32 flags = 0;
 +		    numlen = 3;
 +		    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
 +		    RExC_parse += numlen;
 +		    if (PL_encoding && value < 0x100)
 +			goto recode_encoding;
 +		    break;
 +		}
 +	    recode_encoding:
 +		{
 +		    SV* enc = PL_encoding;
 +		    value = reg_recode((const char)(U8)value, &enc);
 +		    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
 +			vWARN(RExC_parse,
 +			      "Invalid escape in the specified encoding");
 +		    break;
 +		}
  	    default:
  		if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
  		    vWARN2(RExC_parse,
 diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t
 index 606a84a..354156a 100755
 --- a/t/uni/tr_utf8.t
 +++ b/t/uni/tr_utf8.t
 @@ -31,7 +31,7 @@ BEGIN {
  }

  use strict;
 -use Test::More tests => 7;
 +use Test::More tests => 8;

  use encoding 'utf8';

 @@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana");
    $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/;
    is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
  }
 +
 +{
 +  # [perl #40641]
 +  my $str = qq/Gebääääääääääääääääääääude/;
 +  my $reg = qr/Gebääääääääääääääääääääude/;
 +  ok($str =~ /$reg/, "[perl #40641]");
 +}
 +
  __END__
 diff --git a/utf8.h b/utf8.h
 index 6d63897..3800866 100644
 --- a/utf8.h
 +++ b/utf8.h
 @@ -198,6 +198,8 @@ encoded character.
  					 UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
  #define UTF8_ALLOW_ANY			0x00FF
  #define UTF8_CHECK_ONLY			0x0200
 +#define UTF8_ALLOW_DEFAULT             (ckWARN(WARN_UTF8) ? 0 : \
 +                                        UTF8_ALLOW_ANYUV)

  #define UNICODE_SURROGATE_FIRST		0xD800
  #define UNICODE_SURROGATE_LAST		0xDFFF
	Fix a double free / segfault with utf8 regexps
	Debian #454792
	[rt.cpan.org #48156]
	[rt.cpan.org #40641]
	upstream change 29204

	UTF8_ALLOW_DEFAULT definition in utf8.h picked from upstream change 27688

	diff --git a/embed.fnc b/embed.fnc
	index edfbc0e..26524c7 100644
	--- a/embed.fnc
	+++ b/embed.fnc
	@@ -1168,6 +1168,7 @@ Es \|void \|reguni \|NN const struct RExC_state_t state\|UV uv\|NN char s\|NN STRLE
	Es \|regnode\|regclass \|NN struct RExC_state_t state
	ERs \|I32 \|regcurly \|NN const char *
	Es \|regnode\|reg_node \|NN struct RExC_state_t state\|U8 op
	+Es \|UV \|reg_recode \|const char value\|NULLOK SV **encp
	Es \|regnode\|regpiece \|NN struct RExC_state_t state\|NN I32 *flagp
	Es \|void \|reginsert \|NN struct RExC_state_t state\|U8 op\|NN regnode opnd
	Es \|void \|regoptail \|NN struct RExC_state_t state\|NN regnode p\|NN regnode *val
	diff --git a/embed.h b/embed.h
	index 2b38fd5..372b04f 100644
	--- a/embed.h
	+++ b/embed.h
	@@ -1234,6 +1234,7 @@
	#define regclass S_regclass
	#define regcurly S_regcurly
	#define reg_node S_reg_node
	+#define reg_recode S_reg_recode
	#define regpiece S_regpiece
	#define reginsert S_reginsert
	#define regoptail S_regoptail
	@@ -3277,6 +3278,7 @@
	#define regclass(a) S_regclass(aTHX_ a)
	#define regcurly(a) S_regcurly(aTHX_ a)
	#define reg_node(a,b) S_reg_node(aTHX_ a,b)
	+#define reg_recode(a,b) S_reg_recode(aTHX_ a,b)
	#define regpiece(a,b) S_regpiece(aTHX_ a,b)
	#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
	#define regoptail(a,b,c) S_regoptail(aTHX_ a,b,c)
	diff --git a/pod/perldiag.pod b/pod/perldiag.pod
	index 9b3134c..7d95216 100644
	--- a/pod/perldiag.pod
	+++ b/pod/perldiag.pod
	@@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler. See L<attributes>.
	(W printf) Perl does not understand the given format conversion. See
	L<perlfunc/sprintf>.

	+=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/
	+
	+(W regexp) The numeric escape (for example C<\xHH>) of value < 256
	+didn't correspond to a single character through the conversion
	+from the encoding specified by the encoding pragma.
	+The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead.
	+The <-- HERE shows in the regular expression about where the
	+escape was discovered.
	+
	=item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/

	(F) The range specified in a character class had a minimum character
	diff --git a/proto.h b/proto.h
	index 6d185dd..ef6c0cf 100644
	--- a/proto.h
	+++ b/proto.h
	@@ -1748,6 +1748,7 @@ STATIC I32 S_regcurly(pTHX_ const char *)
	__attribute__warn_unused_result__;

	STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t *state, U8 op);
	+STATIC UV S_reg_recode(pTHX_ const char value, SV **encp);
	STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t state, I32 flagp);
	STATIC void S_reginsert(pTHX_ struct RExC_state_t state, U8 op, regnode opnd);
	STATIC void S_regoptail(pTHX_ struct RExC_state_t state, regnode p, regnode *val);
	diff --git a/regcomp.c b/regcomp.c
	index 928cf39..98d48dd 100644
	--- a/regcomp.c
	+++ b/regcomp.c
	@@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t pRExC_state, I32 flagp)
	}

	/*
	+ * reg_recode
	+ *
	+ * It returns the code point in utf8 for the value in *encp.
	+ * value: a code value in the source encoding
	+ * encp: a pointer to an Encode object
	+ *
	+ * If the result from Encode is not a single character,
	+ * it returns U+FFFD (Replacement character) and sets *encp to NULL.
	+ */
	+STATIC UV
	+S_reg_recode(pTHX_ const char value, SV **encp)
	+{
	+ STRLEN numlen = 1;
	+ SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
	+ const char * const s = encp && encp ? sv_recode_to_utf8(sv, encp)
	+ : SvPVX(sv);
	+ const STRLEN newlen = SvCUR(sv);
	+ UV uv = UNICODE_REPLACEMENT;
	+
	+ if (newlen)
	+ uv = SvUTF8(sv)
	+ ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
	+ : (U8)s;
	+
	+ if (!newlen \|\| numlen != newlen) {
	+ uv = UNICODE_REPLACEMENT;
	+ if (encp)
	+ *encp = NULL;
	+ }
	+ return uv;
	+}
	+
	+/*
	- regatom - the lowest level
	*
	* Optimization: gobbles an entire sequence of ordinary characters so that
	@@ -3182,6 +3215,8 @@ tryagain:
	ender = grok_hex(p, &numlen, &flags, NULL);
	p += numlen;
	}
	+ if (PL_encoding && ender < 0x100)
	+ goto recode_encoding;
	break;
	case 'c':
	p++;
	@@ -3201,6 +3236,17 @@ tryagain:
	--p;
	goto loopdone;
	}
	+ if (PL_encoding && ender < 0x100)
	+ goto recode_encoding;
	+ break;
	+ recode_encoding:
	+ {
	+ SV* enc = PL_encoding;
	+ ender = reg_recode((const char)(U8)ender, &enc);
	+ if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
	+ vWARN(p, "Invalid escape in the specified encoding");
	+ RExC_utf8 = 1;
	+ }
	break;
	case '\0':
	if (p >= RExC_end)
	@@ -3331,32 +3377,6 @@ tryagain:
	break;
	}

	- /* If the encoding pragma is in effect recode the text of
	- * any EXACT-kind nodes. */
	- if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
	- STRLEN oldlen = STR_LEN(ret);
	- SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
	-
	- if (RExC_utf8)
	- SvUTF8_on(sv);
	- if (sv_utf8_downgrade(sv, TRUE)) {
	- const char * const s = sv_recode_to_utf8(sv, PL_encoding);
	- const STRLEN newlen = SvCUR(sv);
	-
	- if (SvUTF8(sv))
	- RExC_utf8 = 1;
	- if (!SIZE_ONLY) {
	- DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %s to %s\n",
	- (int)oldlen, STRING(ret),
	- (int)newlen, s));
	- Copy(s, STRING(ret), newlen, char);
	- STR_LEN(ret) += newlen - oldlen;
	- RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
	- } else
	- RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
	- }
	- }
	-
	return(ret);
	}

	@@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
	value = grok_hex(RExC_parse, &numlen, &flags, NULL);
	RExC_parse += numlen;
	}
	+ if (PL_encoding && value < 0x100)
	+ goto recode_encoding;
	break;
	case 'c':
	value = UCHARAT(RExC_parse++);
	@@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
	break;
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	- {
	- I32 flags = 0;
	- numlen = 3;
	- value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	- RExC_parse += numlen;
	- break;
	- }
	+ {
	+ I32 flags = 0;
	+ numlen = 3;
	+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
	+ RExC_parse += numlen;
	+ if (PL_encoding && value < 0x100)
	+ goto recode_encoding;
	+ break;
	+ }
	+ recode_encoding:
	+ {
	+ SV* enc = PL_encoding;
	+ value = reg_recode((const char)(U8)value, &enc);
	+ if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
	+ vWARN(RExC_parse,
	+ "Invalid escape in the specified encoding");
	+ break;
	+ }
	default:
	if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
	vWARN2(RExC_parse,
	diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t
	index 606a84a..354156a 100755
	--- a/t/uni/tr_utf8.t
	+++ b/t/uni/tr_utf8.t
	@@ -31,7 +31,7 @@ BEGIN {
	}

	use strict;
	-use Test::More tests => 7;
	+use Test::More tests => 8;

	use encoding 'utf8';

	@@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana");
	$line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/;
	is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
	}
	+
	+{
	+ # [perl #40641]
	+ my $str = qq/Gebääääääääääääääääääääude/;
	+ my $reg = qr/Gebääääääääääääääääääääude/;
	+ ok($str =~ /$reg/, "[perl #40641]");
	+}
	+
	__END__
	diff --git a/utf8.h b/utf8.h
	index 6d63897..3800866 100644
	--- a/utf8.h
	+++ b/utf8.h
	@@ -198,6 +198,8 @@ encoded character.
	UTF8_ALLOW_SURROGATE\|UTF8_ALLOW_FFFF)
	#define UTF8_ALLOW_ANY 0x00FF
	#define UTF8_CHECK_ONLY 0x0200
	+#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
	+ UTF8_ALLOW_ANYUV)

	#define UNICODE_SURROGATE_FIRST 0xD800
	#define UNICODE_SURROGATE_LAST 0xDFFF