| Fix a double free / segfault with utf8 regexps |
| Debian #454792 |
| [rt.cpan.org #48156] |
| [rt.cpan.org #40641] |
| upstream change 29204 |
| |
| UTF8_ALLOW_DEFAULT definition in utf8.h picked from upstream change 27688 |
| |
| diff --git a/embed.fnc b/embed.fnc |
| index edfbc0e..26524c7 100644 |
| --- a/embed.fnc |
| +++ b/embed.fnc |
| @@ -1168,6 +1168,7 @@ Es |void |reguni |NN const struct RExC_state_t *state|UV uv|NN char *s|NN STRLE |
| Es |regnode*|regclass |NN struct RExC_state_t *state |
| ERs |I32 |regcurly |NN const char * |
| Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op |
| +Es |UV |reg_recode |const char value|NULLOK SV **encp |
| Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp |
| Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd |
| Es |void |regoptail |NN struct RExC_state_t *state|NN regnode *p|NN regnode *val |
| diff --git a/embed.h b/embed.h |
| index 2b38fd5..372b04f 100644 |
| --- a/embed.h |
| +++ b/embed.h |
| @@ -1234,6 +1234,7 @@ |
| #define regclass S_regclass |
| #define regcurly S_regcurly |
| #define reg_node S_reg_node |
| +#define reg_recode S_reg_recode |
| #define regpiece S_regpiece |
| #define reginsert S_reginsert |
| #define regoptail S_regoptail |
| @@ -3277,6 +3278,7 @@ |
| #define regclass(a) S_regclass(aTHX_ a) |
| #define regcurly(a) S_regcurly(aTHX_ a) |
| #define reg_node(a,b) S_reg_node(aTHX_ a,b) |
| +#define reg_recode(a,b) S_reg_recode(aTHX_ a,b) |
| #define regpiece(a,b) S_regpiece(aTHX_ a,b) |
| #define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c) |
| #define regoptail(a,b,c) S_regoptail(aTHX_ a,b,c) |
| diff --git a/pod/perldiag.pod b/pod/perldiag.pod |
| index 9b3134c..7d95216 100644 |
| --- a/pod/perldiag.pod |
| +++ b/pod/perldiag.pod |
| @@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler. See L<attributes>. |
| (W printf) Perl does not understand the given format conversion. See |
| L<perlfunc/sprintf>. |
| |
| +=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/ |
| + |
| +(W regexp) The numeric escape (for example C<\xHH>) of value < 256 |
| +didn't correspond to a single character through the conversion |
| +from the encoding specified by the encoding pragma. |
| +The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead. |
| +The <-- HERE shows in the regular expression about where the |
| +escape was discovered. |
| + |
| =item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/ |
| |
| (F) The range specified in a character class had a minimum character |
| diff --git a/proto.h b/proto.h |
| index 6d185dd..ef6c0cf 100644 |
| --- a/proto.h |
| +++ b/proto.h |
| @@ -1748,6 +1748,7 @@ STATIC I32 S_regcurly(pTHX_ const char *) |
| __attribute__warn_unused_result__; |
| |
| STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t *state, U8 op); |
| +STATIC UV S_reg_recode(pTHX_ const char value, SV **encp); |
| STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp); |
| STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd); |
| STATIC void S_regoptail(pTHX_ struct RExC_state_t *state, regnode *p, regnode *val); |
| diff --git a/regcomp.c b/regcomp.c |
| index 928cf39..98d48dd 100644 |
| --- a/regcomp.c |
| +++ b/regcomp.c |
| @@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) |
| } |
| |
| /* |
| + * reg_recode |
| + * |
| + * It returns the code point in utf8 for the value in *encp. |
| + * value: a code value in the source encoding |
| + * encp: a pointer to an Encode object |
| + * |
| + * If the result from Encode is not a single character, |
| + * it returns U+FFFD (Replacement character) and sets *encp to NULL. |
| + */ |
| +STATIC UV |
| +S_reg_recode(pTHX_ const char value, SV **encp) |
| +{ |
| + STRLEN numlen = 1; |
| + SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); |
| + const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) |
| + : SvPVX(sv); |
| + const STRLEN newlen = SvCUR(sv); |
| + UV uv = UNICODE_REPLACEMENT; |
| + |
| + if (newlen) |
| + uv = SvUTF8(sv) |
| + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) |
| + : *(U8*)s; |
| + |
| + if (!newlen || numlen != newlen) { |
| + uv = UNICODE_REPLACEMENT; |
| + if (encp) |
| + *encp = NULL; |
| + } |
| + return uv; |
| +} |
| + |
| +/* |
| - regatom - the lowest level |
| * |
| * Optimization: gobbles an entire sequence of ordinary characters so that |
| @@ -3182,6 +3215,8 @@ tryagain: |
| ender = grok_hex(p, &numlen, &flags, NULL); |
| p += numlen; |
| } |
| + if (PL_encoding && ender < 0x100) |
| + goto recode_encoding; |
| break; |
| case 'c': |
| p++; |
| @@ -3201,6 +3236,17 @@ tryagain: |
| --p; |
| goto loopdone; |
| } |
| + if (PL_encoding && ender < 0x100) |
| + goto recode_encoding; |
| + break; |
| + recode_encoding: |
| + { |
| + SV* enc = PL_encoding; |
| + ender = reg_recode((const char)(U8)ender, &enc); |
| + if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) |
| + vWARN(p, "Invalid escape in the specified encoding"); |
| + RExC_utf8 = 1; |
| + } |
| break; |
| case '\0': |
| if (p >= RExC_end) |
| @@ -3331,32 +3377,6 @@ tryagain: |
| break; |
| } |
| |
| - /* If the encoding pragma is in effect recode the text of |
| - * any EXACT-kind nodes. */ |
| - if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { |
| - STRLEN oldlen = STR_LEN(ret); |
| - SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); |
| - |
| - if (RExC_utf8) |
| - SvUTF8_on(sv); |
| - if (sv_utf8_downgrade(sv, TRUE)) { |
| - const char * const s = sv_recode_to_utf8(sv, PL_encoding); |
| - const STRLEN newlen = SvCUR(sv); |
| - |
| - if (SvUTF8(sv)) |
| - RExC_utf8 = 1; |
| - if (!SIZE_ONLY) { |
| - DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", |
| - (int)oldlen, STRING(ret), |
| - (int)newlen, s)); |
| - Copy(s, STRING(ret), newlen, char); |
| - STR_LEN(ret) += newlen - oldlen; |
| - RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); |
| - } else |
| - RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); |
| - } |
| - } |
| - |
| return(ret); |
| } |
| |
| @@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) |
| value = grok_hex(RExC_parse, &numlen, &flags, NULL); |
| RExC_parse += numlen; |
| } |
| + if (PL_encoding && value < 0x100) |
| + goto recode_encoding; |
| break; |
| case 'c': |
| value = UCHARAT(RExC_parse++); |
| @@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) |
| break; |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| - { |
| - I32 flags = 0; |
| - numlen = 3; |
| - value = grok_oct(--RExC_parse, &numlen, &flags, NULL); |
| - RExC_parse += numlen; |
| - break; |
| - } |
| + { |
| + I32 flags = 0; |
| + numlen = 3; |
| + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); |
| + RExC_parse += numlen; |
| + if (PL_encoding && value < 0x100) |
| + goto recode_encoding; |
| + break; |
| + } |
| + recode_encoding: |
| + { |
| + SV* enc = PL_encoding; |
| + value = reg_recode((const char)(U8)value, &enc); |
| + if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) |
| + vWARN(RExC_parse, |
| + "Invalid escape in the specified encoding"); |
| + break; |
| + } |
| default: |
| if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) |
| vWARN2(RExC_parse, |
| diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t |
| index 606a84a..354156a 100755 |
| --- a/t/uni/tr_utf8.t |
| +++ b/t/uni/tr_utf8.t |
| @@ -31,7 +31,7 @@ BEGIN { |
| } |
| |
| use strict; |
| -use Test::More tests => 7; |
| +use Test::More tests => 8; |
| |
| use encoding 'utf8'; |
| |
| @@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana"); |
| $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/; |
| is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]"); |
| } |
| + |
| +{ |
| + # [perl #40641] |
| + my $str = qq/Gebääääääääääääääääääääude/; |
| + my $reg = qr/Gebääääääääääääääääääääude/; |
| + ok($str =~ /$reg/, "[perl #40641]"); |
| +} |
| + |
| __END__ |
| diff --git a/utf8.h b/utf8.h |
| index 6d63897..3800866 100644 |
| --- a/utf8.h |
| +++ b/utf8.h |
| @@ -198,6 +198,8 @@ encoded character. |
| UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) |
| #define UTF8_ALLOW_ANY 0x00FF |
| #define UTF8_CHECK_ONLY 0x0200 |
| +#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \ |
| + UTF8_ALLOW_ANYUV) |
| |
| #define UNICODE_SURROGATE_FIRST 0xD800 |
| #define UNICODE_SURROGATE_LAST 0xDFFF |