/* * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions.
*/
/** * Converts ASCII alphabet characters [A-Za-z] in the given 's' to * supplementary characters. This method does NOT fully take care * of the regex syntax.
*/ publicstatic String toSupplementaries(String s) { int length = s.length();
StringBuilder sb = new StringBuilder(length * 2);
for (int i = 0; i < length; ) { char c = s.charAt(i++); if (c == '\\') {
sb.append(c); if (i < length) {
c = s.charAt(i++);
sb.append(c); if (c == 'u') { // assume no syntax error
sb.append(s.charAt(i++));
sb.append(s.charAt(i++));
sb.append(s.charAt(i++));
sb.append(s.charAt(i++));
}
}
} elseif ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
sb.append('\ud800').append((char)('\udc00'+c));
} else {
sb.append(c);
}
} return sb.toString();
}
// Regular expression tests

// Following three tests execute from a file.

// Hands the case file TestCases.txt to processFile.
@Test
public static void processTestCases() throws IOException {
    processFile("TestCases.txt");
}
// This is for bug6635133
// Test if surrogate pair in Unicode escapes can be handled correctly.
@Test
public static void surrogatesInClassTest() {
    // The class range is spelled with regex-level escapes (the Java literal
    // contains backslashes), while the input holds the real surrogate pair
    // U+D834 U+DD22, which lies inside that range.
    Pattern pattern = Pattern.compile("[\\ud834\\udd21-\\ud834\\udd24]");
    Matcher matcher = pattern.matcher("\ud834\udd22");
    assertTrue(matcher.find(), "Surrogate pair in Unicode escape");
}
// This is for bug6990617
// Test if Pattern.RemoveQEQuoting works correctly if the octal unicode
// char encoding is only 2 or 3 digits instead of 4 and the first quoted
// char is an octal digit.
@Test publicstaticvoid removeQEQuotingTest() {
// Each octal escape (011 = tab) is immediately followed by a quoted run
// whose first character ('1' / '2') is itself an octal digit.
Pattern pattern =
Pattern.compile("\\011\\Q1sometext\\E\\011\\Q2sometext\\E");
// The input carries a literal tab where the pattern has the octal escape.
Matcher matcher = pattern.matcher("\t1sometext\t2sometext");
// NOTE(review): no assertion on matcher and no closing brace for this
// method are visible here - confirm the rest of the body.
// This is for bug 4988891
// Test toMatchResult to see that it is a copy of the Matcher
// that is not affected by subsequent operations on the original
@Test
public static void toMatchResultTest() {
    Pattern pattern = Pattern.compile("squid");
    Matcher matcher = pattern.matcher(
        "agiantsquidofdestinyasmallsquidoffate");

    // First match: the snapshot must be a distinct object that reports
    // the same start index as the matcher it was taken from.
    matcher.find();
    int matcherStart1 = matcher.start();
    MatchResult mr = matcher.toMatchResult();
    assertNotSame(mr, matcher, "Matcher toMatchResult is identical object");
    int resultStart1 = mr.start();
    assertEquals(matcherStart1, resultStart1, "equal matchers don't have equal start indices");

    // Second match: the matcher moves on, the first snapshot must not.
    matcher.find();
    int matcherStart2 = matcher.start();
    int resultStart2 = mr.start();
    assertNotEquals(matcherStart2, resultStart2, "Matcher2 and Result2 should not be equal");
    assertEquals(resultStart1, resultStart2, "Second match result should have the same state");

    // A fresh snapshot reflects the matcher's new position.
    MatchResult mr2 = matcher.toMatchResult();
    assertNotSame(mr, mr2, "Second Matcher copy should not be identical to the first.");
    assertEquals(mr2.start(), matcherStart2, "mr2 index should equal matcher index");
}
// This is for bug 8074678
// Test the result of toMatchResult throws ISE if no match is available
@Test publicstaticvoid toMatchResultTest2() {
Matcher matcher = Pattern.compile("nomatch").matcher("hello world");
// "nomatch" does not occur in the input, so the snapshot below is taken
// from a matcher that has no current match.
matcher.find();
MatchResult mr = matcher.toMatchResult();
// NOTE(review): the checks on mr and the closing brace of this method are
// not visible here - confirm the rest of the body.
// This is for bug 5013885 // Must test a slice to see if it reports hitEnd correctly
@Test publicstaticvoid hitEndTest() { // Basic test of Slice node
Pattern p = Pattern.compile("^squidattack");
Matcher m = p.matcher("squack");
m.find();
assertFalse(m.hitEnd(), "Matcher should not be at end of sequence");
m.reset("squid");
m.find();
assertTrue(m.hitEnd(), "Matcher should be at the end of sequence");
// Test Slice, SliceA and SliceU nodes for (int i=0; i<3; i++) { int flags = 0; if (i==1) flags = Pattern.CASE_INSENSITIVE; if (i==2) flags = Pattern.UNICODE_CASE;
p = Pattern.compile("^abc", flags);
m = p.matcher("ad");
m.find();
assertFalse(m.hitEnd(), "Slice node test");
m.reset("ab");
m.find();
assertTrue(m.hitEnd(), "Slice node test");
}
// Test Boyer-Moore node
p = Pattern.compile("catattack");
m = p.matcher("attack");
m.find();
assertTrue(m.hitEnd(), "Boyer-Moore node test");
p = Pattern.compile("catattack");
m = p.matcher("attackattackattackcatatta");
m.find();
assertTrue(m.hitEnd(), "Boyer-More node test");
// 8184706: Matching u+0d at EOL against \R should hit-end
p = Pattern.compile("...\\R");
m = p.matcher("cat" + (char)0x0a);
m.find();
assertFalse(m.hitEnd());
m = p.matcher("cat" + (char)0x0d);
m.find();
assertTrue(m.hitEnd());
m = p.matcher("cat" + (char)0x0d + (char)0x0a);
m.find();
assertFalse(m.hitEnd());
}
// This is for bug 4997476
// It is weird code submitted by customer demonstrating a regression
@Test
public static void wordSearchTest() {
    String testString = "word1 word2 word3";
    Pattern p = Pattern.compile("\\b");
    Matcher m = p.matcher(testString);
    int position = 0;
    int start;
    // Walk the input from one word boundary to the next.
    while (m.find(position)) {
        start = m.start();
        if (start == testString.length())
            break;
        // The next boundary after start ends the current segment; when
        // there is none, the segment runs to the end of the input.
        if (m.find(start+1)) {
            position = m.start();
        } else {
            position = testString.length();
        }
        // Skip the separator between two words.
        if (testString.substring(start, position).equals(" "))
            continue;
        assertTrue(testString.substring(start, position-1).startsWith("word"));
    }
}
// This is for bug 4994840
@Test
public static void caretAtEndTest() {
    // Problem only occurs with multiline patterns
    // containing a beginning-of-line caret "^" followed
    // by an expression that also matches the empty string.
    Pattern pattern = Pattern.compile("^x?", Pattern.MULTILINE);
    Matcher matcher = pattern.matcher("\r");
    // No assertions: the test passes as long as neither find() throws.
    matcher.find();
    matcher.find();
}
// This test is for 4979006
// Check to see if word boundary construct properly handles unicode
// non spacing marks
@Test publicstaticvoid unicodeWordBoundsTest() {
String spaces = " ";
String wordChar = "a";
// U+030A (combining ring above) serves as the non-spacing mark.
String nsm = "\u030a";
// NOTE(review): the rest of this method, including its closing brace, is
// not visible here - confirm the body is complete.
/**
 * Resets {@code matcher} to {@code input}, runs {@code find()} twice and
 * asserts the start index reported after each call.
 *
 * <p>The result of {@code find()} is not checked; if a find fails, the
 * following {@code start()} call throws {@code IllegalStateException}.
 *
 * @param input   the text to search
 * @param matcher the matcher to reset and drive
 * @param a       expected start index after the first find
 * @param b       expected start index after the second find
 */
private static void twoFindIndexes(String input, Matcher matcher, int a,
                                   int b)
{
    matcher.reset(input);
    matcher.find();
    assertEquals(matcher.start(), a);
    matcher.find();
    assertEquals(matcher.start(), b);
}
// This test is for 6284152 privatestaticvoid check(String regex, String input, String[] expected) {
List<String> result = new ArrayList<>();
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(input); while (m.find()) {
result.add(m.group());
}
assertEquals(Arrays.asList(expected), result);
}
// NOTE(review): the header of the method these statements belong to is not
// visible here; only its closing brace is.
//boundary at end of the lookbehind sub-regex should work consistently
//with the boundary just after the lookbehind sub-regex
check("(?<=.*\\b)foo", "abcd foo", new String[]{"foo"});
check("(?<=.*)\\bfoo", "abcd foo", new String[]{"foo"});
// NOTE(review): the regex literal in the next two calls is cut off right
// after "(?" and has no closing quote, so these lines are not valid Java
// as written - restore the original pattern text.
check("(?, "abc foo", new String[0]);
check("(?, "abc foo", new String[0]);
//Negative
// NOTE(review): same truncated regex literal as above.
check("(?, "%foo1\n%bar foo2\n%bar foo3\n%blahblah foo4\nfoo5", new String[] {"foo4", "foo5"});
//Positive greedy
check("(?<=%b{1,4})foo", "%bbbbfoo", new String[] {"foo"});
//Positive reluctant
check("(?<=%b{1,4}?)foo", "%bbbbfoo", new String[] {"foo"});
//supplementary
check("(?<=%b{1,4})fo\ud800\udc00o", "%bbbbfo\ud800\udc00o", new String[] {"fo\ud800\udc00o"});
check("(?<=%b{1,4}?)fo\ud800\udc00o", "%bbbbfo\ud800\udc00o", new String[] {"fo\ud800\udc00o"});
// NOTE(review): same truncated regex literal as above in the next two calls.
check("(?, "%afo\ud800\udc00o", new String[] {"fo\ud800\udc00o"});
check("(?, "%afo\ud800\udc00o", new String[] {"fo\ud800\udc00o"});
}
// This test is for 4938995
// Check to see if weak region boundaries are transparent to
// lookahead and lookbehind constructs
@Test publicstaticvoid boundsTest() {
String fullMessage = "catdogcat";
Pattern pattern = Pattern.compile("(?<=cat)dog(?=cat)");
// The initial input is fullMessage without its final 't'.
Matcher matcher = pattern.matcher("catdogca");
matcher.useTransparentBounds(true);
// NOTE(review): the assertions and the closing brace of this method are
// not visible here - confirm the rest of the body.
// This test is for 4945394
@Test
public static void findFromTest() {
    String message = "This is 40 $0 message.";
    // The escaped dollar makes "$0" a literal two-character match.
    Pattern pat = Pattern.compile("\\$0");
    Matcher match = pat.matcher(message);
    // Exactly one occurrence: the first find succeeds, every later one
    // must keep failing.
    assertTrue(match.find());
    assertFalse(match.find());
    assertFalse(match.find());
}
// This test is for 4872664 and 4892980
@Test
public static void negatedCharClassTest() {
    // A negated class of one ASCII character must accept a non-ASCII one.
    Pattern pattern = Pattern.compile("[^>]");
    Matcher matcher = pattern.matcher("\u203A");
    assertTrue(matcher.matches());
    matcher.reset("\u203A");
    assertTrue(matcher.find());

    // Splitting on "anything but f or r" leaves just those two letters.
    String s = "for";
    String[] result = s.split("[^fr]");
    assertEquals(result[0], "f");
    assertEquals(result[1], "r");
    s = "f\u203Ar";
    result = s.split("[^fr]");
    assertEquals(result[0], "f");
    assertEquals(result[1], "r");

    // Test adding to bits, subtracting a node, then adding to bits again
    pattern = Pattern.compile("[^f\u203Ar]");
    matcher = pattern.matcher("a");
    assertTrue(matcher.find());
    matcher.reset("f");
    assertFalse(matcher.find());
    matcher.reset("\u203A");
    assertFalse(matcher.find());
    matcher.reset("r");
    assertFalse(matcher.find());
    matcher.reset("\u203B");
    assertTrue(matcher.find());

    // Test subtracting a node, adding to bits, subtracting again
    pattern = Pattern.compile("[^\u203Ar\u203B]");
    matcher = pattern.matcher("a");
    assertTrue(matcher.find());
    matcher.reset("\u203A");
    assertFalse(matcher.find());
    matcher.reset("r");
    assertFalse(matcher.find());
    matcher.reset("\u203B");
    assertFalse(matcher.find());
    matcher.reset("\u203C");
    assertTrue(matcher.find());
}
// This test is for 4628291
@Test
public static void toStringTest() {
    Pattern pattern = Pattern.compile("b+");
    assertEquals(pattern.toString(), "b+");

    // Matcher.toString() output is unspecified, so it is only called in
    // each matcher state (fresh, after a find, after setting a region,
    // after a reset) to check that it does not throw.
    Matcher matcher = pattern.matcher("aaabbbccc");
    matcher.toString(); // unspecified
    matcher.find();
    matcher.toString(); // unspecified
    matcher.region(0,3);
    matcher.toString(); // unspecified
    matcher.reset();
    matcher.toString(); // unspecified
}
// This test is for 4808962
@Test publicstaticvoid literalPatternTest() { int flags = Pattern.LITERAL;
// NOTE(review): the declarations of pattern and p are not visible in this
// method as shown - confirm where they come from.
// note: this is case-sensitive.
// With LITERAL the dots are ordinary characters, so only the exact text
// matches and "axxxb" does not.
pattern = Pattern.compile(toSupplementaries("a...b"), flags);
check(pattern, toSupplementaries("a...b"), true);
check(pattern, toSupplementaries("axxxb"), false);
flags |= Pattern.CANON_EQ;
String t = toSupplementaries("test"); //Note: Possible issue
// NOTE(review): the pattern compiled here is stored in p, but the two
// checks below still use pattern - confirm this is intended.
p = Pattern.compile(t + "a\u030a", flags);
check(pattern, t + "a\u030a", false);
check(pattern, t + "\u00e5", false);
}
// This test is for 4803179
// This test is also for 4808962, replacement parts
@Test
public static void literalReplacementTest() {
    // NOTE(review): flags is not read by any statement visible in this
    // method.
    int flags = Pattern.LITERAL;

    // IAE should be thrown if backslash or '$' is the last character
    // in replacement string
    assertThrows(IllegalArgumentException.class, () -> "\uac00".replaceAll("\uac00", "$"));
    assertThrows(IllegalArgumentException.class, () -> "\uac00".replaceAll("\uac00", "\\"));
}
// This test is for 4757029
@Test publicstaticvoid regionTest() {
Pattern pattern = Pattern.compile("abc");
Matcher matcher = pattern.matcher("abcdefabc");
// NOTE(review): no region-related statements are visible between the lines
// above and the quoting check below, and the method's closing brace is
// missing - confirm the rest of the body.
// Supplementary character test
pattern = Pattern.compile(toSupplementaries("\\Qdir1\\dir2\\E"));
check(pattern, toSupplementaries("dir1\\dir2"), true);
// This test is for 4792284
@Test publicstaticvoid nonCaptureRepetitionTest() {
String input = "abcdefgh;";
// NOTE(review): on the next line everything after the first "//" is
// comment text, so only the first four patterns are actually array
// elements as written - confirm against the intended list.
String[] patterns = new String[] { "(?:\\w{4})+;", "(?:\\w{8})*;", "(?:\\w{2}){2,4};", "(?:\\w{4}){2,};", // only matches the ".*?(?:\\w{5})+;", // specified minimum ".*?(?:\\w{9})*;", // number of reps - OK "(?:\\w{4})+?;", // lazy repetition - OK "(?:\\w{4})++;", // possessive repetition - OK "(?:\\w{2,}?)+;", // non-deterministic - OK "(\\w{4})+;", // capturing group - OK
};
for (String pattern : patterns) { // Check find()
check(pattern, 0, input, input, true); // Check matches()
Pattern p = Pattern.compile(pattern);
Matcher m = p.matcher(input);
// NOTE(review): the matches() assertion and the closing braces of the loop
// and the method are not visible here - confirm the rest of the body.
// This test is for 6358731
@Test publicstaticvoid notCapturedGroupCurlyMatchTest() {
// Two alternatives, each a repeated capturing group, tried on "abcd".
Pattern pattern = Pattern.compile("(abc)+|(abcd)+");
Matcher matcher = pattern.matcher("abcd");
// NOTE(review): the assertions and the closing brace of this method are
// not visible here - confirm the rest of the body.
// This test is for 4523620
/*
private static void numOccurrencesTest() throws Exception {
    Pattern pattern = Pattern.compile("aaa");

    if (pattern.numOccurrences("aaaaaa", false) != 2)
        failCount++;
    if (pattern.numOccurrences("aaaaaa", true) != 4)
        failCount++;

    pattern = Pattern.compile("^");
    if (pattern.numOccurrences("aaaaaa", false) != 1)
        failCount++;
    if (pattern.numOccurrences("aaaaaa", true) != 1)
        failCount++;

    report("Number of Occurrences");
}
*/
// This test is for 4776374
// Declares four flag combinations, all based on DOTALL: alone, with
// UNIX_LINES, with UNIX_LINES and MULTILINE, and with MULTILINE.
// NOTE(review): the statements that use these flags and the closing brace
// of this method are not visible here - confirm the rest of the body.
@Test publicstaticvoid caretBetweenTerminatorsTest() { int flags1 = Pattern.DOTALL; int flags2 = Pattern.DOTALL | Pattern.UNIX_LINES; int flags3 = Pattern.DOTALL | Pattern.UNIX_LINES | Pattern.MULTILINE; int flags4 = Pattern.DOTALL | Pattern.MULTILINE;
// This test is for 4727935
// Declares three flag combinations, all based on DOTALL: alone, with
// UNIX_LINES, and with MULTILINE.
// NOTE(review): the statements that use these flags and the closing brace
// of this method are not visible here - confirm the rest of the body.
@Test publicstaticvoid dollarAtEndTest() { int flags1 = Pattern.DOTALL; int flags2 = Pattern.DOTALL | Pattern.UNIX_LINES; int flags3 = Pattern.DOTALL | Pattern.MULTILINE;
// This test is for 4711773
@Test
public static void multilineDollarTest() {
    // In MULTILINE mode "$" matches before the line terminator and again
    // at the end of the input.
    Pattern findCR = Pattern.compile("$", Pattern.MULTILINE);
    Matcher matcher = findCR.matcher("first bit\nsecond bit");
    matcher.find();
    assertEquals(matcher.start(), 9);
    matcher.find();
    assertEquals(matcher.start(0), 20);

    // Supplementary character test
    // toSupplementaries turns each letter into two chars, so every other
    // (BMP) character is written twice here - the newline is followed by
    // one padding space - to make all indices exactly double.
    matcher = findCR.matcher(toSupplementaries("first  bit\n second  bit")); // double BMP chars
    matcher.find();
    assertEquals(matcher.start(0), 9*2);
    matcher.find();
    assertEquals(matcher.start(0), 20*2);
}
// Exercises reluctant quantifiers, both on a bounded repetition of a
// group and on a simple one-or-more repetition.
@Test
public static void reluctantRepetitionTest() {
    Pattern p = Pattern.compile("1(\\s\\S+?){1,3}?[\\s,]2");
    check(p, "1 word word word 2", true);
    check(p, "1 wor wo w 2", true);
    check(p, "1 word word 2", true);
    check(p, "1 word 2", true);
    check(p, "1 wo w w 2", true);
    check(p, "1 wo w 2", true);
    check(p, "1 wor w 2", true);

    p = Pattern.compile("([a-z])+?c");
    Matcher m = p.matcher("ababcdefdec");
    check(m, "ababc");

    // Supplementary character test
    p = Pattern.compile(toSupplementaries("([a-z])+?c"));
    m = p.matcher(toSupplementaries("ababcdefdec"));
    check(m, toSupplementaries("ababc"));
}
publicstatic Pattern serializedPattern(Pattern p) throws Exception {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(baos);
oos.writeObject(p);
oos.close(); try (ObjectInputStream ois = new ObjectInputStream( new ByteArrayInputStream(baos.toByteArray()))) { return (Pattern)ois.readObject();
}
}
// NOTE(review): the header of the method these statements belong to, and
// the declarations of pattern, matcher and result, are not visible here.
// Supplementary character test
pattern = Pattern.compile(toSupplementaries("(ab)(c*)"));
matcher = pattern.matcher(toSupplementaries("abccczzzabcczzzabccc"));
// Only the first match is replaced; later occurrences stay untouched.
result = matcher.replaceFirst(toSupplementaries("test"));
assertEquals(result, toSupplementaries("testzzzabcczzzabccc"));
matcher.reset(toSupplementaries("zzzabccczzzabcczzzabccczzz"));
result = matcher.replaceFirst(toSupplementaries("test"));
assertEquals(result, toSupplementaries("zzztestzzzabcczzzabccczzz"));
// Group references in the replacement: $1 keeps "ab", $2 keeps the c run.
matcher.reset(toSupplementaries("zzzabccczzzabcczzzabccczzz"));
result = matcher.replaceFirst("$1");
assertEquals(result, toSupplementaries("zzzabzzzabcczzzabccczzz"));
matcher.reset(toSupplementaries("zzzabccczzzabcczzzabccczzz"));
result = matcher.replaceFirst("$2");
assertEquals(result, toSupplementaries("zzzccczzzabcczzzabccczzz"));
// NOTE(review): the header of the method these statements belong to, and
// the declarations of flags, patterns, texts, expected, pattern and
// matcher, are not visible here.
// <1> CASE_INSENSITIVE alone: each pair matches exactly when expected[i]
// says so.
flags = Pattern.CASE_INSENSITIVE; for (int i = 0; i < patterns.length; i++) {
pattern = Pattern.compile(patterns[i], flags);
matcher = pattern.matcher(texts[i]);
assertEquals(matcher.matches(), expected[i], "<1> Failed at " + i);
}
// <2> CASE_INSENSITIVE together with UNICODE_CASE: every pair must match.
flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE; for (int i = 0; i < patterns.length; i++) {
pattern = Pattern.compile(patterns[i], flags);
matcher = pattern.matcher(texts[i]);
assertTrue(matcher.matches(), "<2> Failed at " + i);
} // flag unicode_case alone should do nothing
// <3> UNICODE_CASE alone: no pair may match.
flags = Pattern.UNICODE_CASE; for (int i = 0; i < patterns.length; i++) {
pattern = Pattern.compile(patterns[i], flags);
matcher = pattern.matcher(texts[i]);
assertFalse(matcher.matches(), "<3> Failed at " + i);
}
// Special cases: i, I, u+0131 and u+0130
// Both characters must be accepted by the range [h-j] when both flags
// are set.
flags = Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE;
pattern = Pattern.compile("[h-j]+", flags);
assertTrue(pattern.matcher("\u0131\u0130").matches());
}
// NOTE(review): the header of the method these statements belong to, and
// the declarations of matcher, r, result, s1, s2 and pattern, are not
// visible here; the fragment also ends without a closing brace.
// Replace only the first match: append the replacement for it, then the
// untouched remainder of the input.
if (matcher.find()) {
StringBuffer sb = new StringBuffer();
matcher.appendReplacement(sb, r);
matcher.appendTail(sb);
result = sb.toString();
assertEquals(result, "Swap one: 123 = first, second = 456");
}
// Supplementary character test
// "$2$1" swaps the two captured groups.
pattern = Pattern.compile(toSupplementaries("(ab)(cd)"));
matcher = pattern.matcher(toSupplementaries("abcd"));
result = matcher.replaceAll("$2$1");
assertEquals(result, toSupplementaries("cdab"));
s1 = toSupplementaries("Swap all: first = 123, second = 456");
s2 = toSupplementaries("Swap one: first = 123, second = 456");
r = toSupplementaries("$3$2$1");
pattern = Pattern.compile(toSupplementaries("([a-z]+)( *= *)([0-9]+)"));
matcher = pattern.matcher(s1);
// NOTE(review): the header of the method these statements belong to, and
// the declarations of result and patternX, are not visible here.
// Supplementary character test
// Split a CharBuffer (not a String) holding the converted text.
CharBuffer cbs = CharBuffer.allocate(100);
cbs.put(toSupplementaries("fooXandXboo"));
cbs.flip();
result = patternX.split(cbs);
assertEquals(result[0], toSupplementaries("foo"));
assertEquals(result[1], toSupplementaries("and"));
assertEquals(result[2], toSupplementaries("boo"));
// Split "0123456789" on each single digit x for every limit in [-2, 2].
String source = "0123456789"; for (int limit=-2; limit<3; limit++) { for (int x=0; x<10; x++) {
result = source.split(Integer.toString(x), limit); int expectedLength = limit < 1 ? 2 : limit;
// The first segment may differ from the text before x only when limit is
// 1, in which case the whole source comes back unsplit.
if (!result[0].equals(source.substring(0,x))) {
assertEquals(limit, 1);
assertEquals(result[0], source.substring(0,10));
} if (expectedLength > 1) { // Check segment 2
assertEquals(result[1], source.substring(x+1,10));
}
}
}
// NOTE(review): the "for" header on the next line sits inside the line
// comment, so it is not code as written, and the closing braces here do
// not pair up with the visible opening braces - confirm the structure.
} // Check the case for no match found for (int limit=-2; limit<3; limit++) {
result = source.split("e", limit);
assertEquals(result.length, 1);
assertEquals(result[0], source);
} // Check the case for limit == 0, source = ""; // split() now returns 0-length for empty source "" see #6559590
source = "";
result = source.split("e", 0);
assertEquals(result.length, 1);
assertEquals(result[0], source);
// Check both split() and splitAsStream(), especially for zero-length
// input and zero-length match cases
String[][] input = new String[][] {
{ " ", "Abc Efg Hij" }, // normal non-zero-match
{ " ", " Abc Efg Hij" }, // leading empty str for non-zero-match
{ " ", "Abc Efg Hij" }, // non-zero-match in the middle
{ "(?=\\p{Lu})", "AbcEfgHij" }, // no leading empty str for zero-match
{ "(?=\\p{Lu})", "AbcEfg" },
{ "(?=\\p{Lu})", "Abc" },
{ " ", "" }, // zero-length input
{ ".*", "" },