From 1e18ddcc98f318b36449102f3267fd4631cc668b Mon Sep 17 00:00:00 2001 From: Gabriel Belingueres Date: Sat, 3 Apr 2021 21:25:38 -0300 Subject: [PATCH] Fix MXParser improve error reporting (#136) (#137) - when parsing large char entities. - when mixing invalid encoding declarations and file encodings. --- .../plexus/util/xml/pull/MXParser.java | 51 +++- ..._BjoernHoehrmannviaHST2013_09_18_Test.java | 278 ++++++++++++++++++ src/test/resources/xmlconf/eduni/misc/001.xml | 4 + src/test/resources/xmlconf/eduni/misc/002.xml | 4 + src/test/resources/xmlconf/eduni/misc/003.xml | 4 + src/test/resources/xmlconf/eduni/misc/004.xml | 4 + src/test/resources/xmlconf/eduni/misc/005.xml | 2 + src/test/resources/xmlconf/eduni/misc/006.xml | 2 + src/test/resources/xmlconf/eduni/misc/007.xml | 1 + src/test/resources/xmlconf/eduni/misc/008.xml | Bin 0 -> 86 bytes src/test/resources/xmlconf/eduni/misc/009.xml | 1 + .../resources/xmlconf/eduni/misc/ht-bh.xml | 37 +++ .../resources/xmlconf/eduni/misc/xmlconf.xml | 19 ++ 13 files changed, 399 insertions(+), 8 deletions(-) create mode 100644 src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java create mode 100644 src/test/resources/xmlconf/eduni/misc/001.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/002.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/003.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/004.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/005.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/006.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/007.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/008.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/009.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/ht-bh.xml create mode 100644 src/test/resources/xmlconf/eduni/misc/xmlconf.xml diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java 
b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index 430c2236..4ce9bf0c 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -11,6 +11,7 @@ import java.io.EOFException; import java.io.IOException; +import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; @@ -122,6 +123,8 @@ private String newStringIntern( char[] cbuf, int off, int len ) // private String elValue[]; private int elNamespaceCount[]; + private String fileEncoding = "UTF8"; + /** * Make sure that we have enough space to keep element stack if passed size. It will always create one additional * slot then current depth @@ -659,6 +662,15 @@ public void setInput( Reader in ) { reset(); reader = in; + + if ( reader instanceof InputStreamReader ) + { + InputStreamReader isr = (InputStreamReader) reader; + if ( isr.getEncoding() != null ) + { + fileEncoding = isr.getEncoding().toUpperCase(); + } + } } @Override @@ -1771,6 +1783,17 @@ private int parseProlog() // skipping UNICODE int Order Mark (so called BOM) ch = more(); } + else if ( ch == '\uFFFD' ) + { + // UTF-16 BOM in an UTF-8 encoded file? + // This is a hack...not the best way to check for BOM in UTF-16 + ch = more(); + if ( ch == '\uFFFD' ) + { + throw new XmlPullParserException( "UTF-16 BOM in a UTF-8 encoded file is incompatible", this, + null ); + } + } } seenMarkup = false; boolean gotS = false; @@ -2723,18 +2746,19 @@ else if ( ch >= 'A' && ch <= 'F' ) } posEnd = pos - 1; - int codePoint = Integer.parseInt( sb.toString(), isHex ? 16 : 10 ); - boolean isValidCodePoint = isValidCodePoint( codePoint ); - if ( isValidCodePoint ) + boolean isValidCodePoint = true; + try { - try + int codePoint = Integer.parseInt( sb.toString(), isHex ? 
16 : 10 ); + isValidCodePoint = isValidCodePoint( codePoint ); + if ( isValidCodePoint ) { charRefOneCharBuf = Character.toChars( codePoint ); } - catch ( IllegalArgumentException e ) - { - isValidCodePoint = false; - } + } + catch ( IllegalArgumentException e ) + { + isValidCodePoint = false; } if ( !isValidCodePoint ) @@ -3328,6 +3352,17 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd ) // TODO reconcile with setInput encodingName inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart ); + + if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) ) + { + throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible", + this, null ); + } + else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" )) + { + throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible", + this, null ); + } } ch = more(); diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java b/src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java new file mode 100644 index 00000000..cf1fe16a --- /dev/null +++ b/src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java @@ -0,0 +1,278 @@ +package org.codehaus.plexus.util.xml.pull; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; + +import org.junit.Before; +import org.junit.Test; + +/** + * Test class that execute a particular set of tests associated to a TESCASES tag from the XML W3C Conformance Tests. + * TESCASES PROFILE:
Bjoern Hoehrmann via HST 2013-09-18
+ * XML test files base folder:
xmlconf/eduni/misc/
+ * + * @author Gabriel Belingueres + */ +public class eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test +{ + + final static File testResourcesDir = new File("src/test/resources/", "xmlconf/eduni/misc/"); + + MXParser parser; + + @Before + public void setUp() + { + parser = new MXParser(); + } + + /** + * Test ID:
hst-bh-001
+ * Test URI:
001.xml
+ * Comment:
decimal charref &#62; 10FFFF, indeed &#62; max 32 bit integer, checking for recovery from possible overflow
+ * Sections:
2.2 [2], 4.1 [66]
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_bh_001() + throws IOException + { + try ( Reader reader = new FileReader( new File( testResourcesDir, "001.xml" ) ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "decimal charref > 10FFFF, indeed > max 32 bit integer, checking for recovery from possible overflow" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "character reference (with hex value FF000000F6) is invalid" ) ); + } + } + + /** + * Test ID:
hst-bh-002
+ * Test URI:
002.xml
+ * Comment:
hex charref &#62; 10FFFF, indeed &#62; max 32 bit integer, checking for recovery from possible overflow
+ * Sections:
2.2 [2], 4.1 [66]
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_bh_002() + throws IOException + { + try ( Reader reader = new FileReader( new File( testResourcesDir, "002.xml" ) ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "hex charref > 10FFFF, indeed > max 32 bit integer, checking for recovery from possible overflow" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "character reference (with decimal value 4294967542) is invalid" ) ); + } + } + + /** + * Test ID:
hst-bh-003
+ * Test URI:
003.xml
+ * Comment:
decimal charref &#62; 10FFFF, indeed &#62; max 64 bit integer, checking for recovery from possible overflow
+ * Sections:
2.2 [2], 4.1 [66]
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_bh_003() + throws IOException + { + try ( Reader reader = new FileReader( new File( testResourcesDir, "003.xml" ) ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "decimal charref > 10FFFF, indeed > max 64 bit integer, checking for recovery from possible overflow" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "character reference (with hex value FFFFFFFF000000F6) is invalid" ) ); + } + } + + /** + * Test ID:
hst-bh-004
+ * Test URI:
004.xml
+ * Comment:
hex charref &#62; 10FFFF, indeed &#62; max 64 bit integer, checking for recovery from possible overflow
+ * Sections:
2.2 [2], 4.1 [66]
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_bh_004() + throws IOException + { + try ( Reader reader = new FileReader( new File( testResourcesDir, "004.xml" ) ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "hex charref > 10FFFF, indeed > max 64 bit integer, checking for recovery from possible overflow" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "character reference (with decimal value 18446744073709551862) is invalid" ) ); + } + } + + /** + * Test ID:
hst-bh-005
+ * Test URI:
005.xml
+ * Comment:
xmlns:xml is an attribute as far as validation is concerned and must be declared
+ * Sections:
3.1 [41]
+ * Version: + * + * @throws IOException if there is an I/O error + * + * NOTE: This test is SKIPPED as MXParser do not supports DOCDECL parsing. + */ + // @Test + public void testhst_bh_005() + throws IOException + { + try ( Reader reader = new FileReader( new File( testResourcesDir, "005.xml" ) ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "xmlns:xml is an attribute as far as validation is concerned and must be declared" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( true ); + } + } + + /** + * Test ID:
hst-bh-006
+ * Test URI:
006.xml
+ * Comment:
xmlns:foo is an attribute as far as validation is concerned and must be declared
+ * Sections:
3.1 [41]
+ * Version: + * + * @throws IOException if there is an I/O error + * + * NOTE: This test is SKIPPED as MXParser do not supports DOCDECL parsing. + */ + // @Test + public void testhst_bh_006() + throws IOException + { + try ( Reader reader = new FileReader( new File( testResourcesDir, "006.xml" ) ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "xmlns:foo is an attribute as far as validation is concerned and must be declared" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( true ); + } + } + + /** + * Test ID:
hst-lhs-007
+ * Test URI:
007.xml
+ * Comment:
UTF-8 BOM plus xml decl of iso-8859-1 incompatible
+ * Sections:
4.3.3
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_lhs_007() + throws IOException + { + try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) ); + InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) ); + } + } + + /** + * Test ID:
hst-lhs-008
+ * Test URI:
008.xml
+ * Comment:
UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible
+ * Sections:
4.3.3
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_lhs_008() + throws IOException + { + try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) ); + InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) ); + } + } + + /** + * Test ID:
hst-lhs-009
+ * Test URI:
009.xml
+ * Comment:
UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible
+ * Sections:
4.3.3
+ * Version: + * + * @throws IOException if there is an I/O error + */ + @Test + public void testhst_lhs_009() + throws IOException + { + try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) ); + InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) ) + { + parser.setInput( reader ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) ); + } + } + +} \ No newline at end of file diff --git a/src/test/resources/xmlconf/eduni/misc/001.xml b/src/test/resources/xmlconf/eduni/misc/001.xml new file mode 100644 index 00000000..76de9900 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/001.xml @@ -0,0 +1,4 @@ + +]> +

Fa&#xFF000000F6;il

diff --git a/src/test/resources/xmlconf/eduni/misc/002.xml b/src/test/resources/xmlconf/eduni/misc/002.xml new file mode 100644 index 00000000..943d284e --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/002.xml @@ -0,0 +1,4 @@ + +]> +

Fa&#4294967542;il

diff --git a/src/test/resources/xmlconf/eduni/misc/003.xml b/src/test/resources/xmlconf/eduni/misc/003.xml new file mode 100644 index 00000000..c2fb6990 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/003.xml @@ -0,0 +1,4 @@ + +]> +

Fa&#xFFFFFFFF000000F6;il

diff --git a/src/test/resources/xmlconf/eduni/misc/004.xml b/src/test/resources/xmlconf/eduni/misc/004.xml new file mode 100644 index 00000000..1e83a946 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/004.xml @@ -0,0 +1,4 @@ + +]> +

Fa&#18446744073709551862;il

diff --git a/src/test/resources/xmlconf/eduni/misc/005.xml b/src/test/resources/xmlconf/eduni/misc/005.xml new file mode 100644 index 00000000..d353623a --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/005.xml @@ -0,0 +1,2 @@ + ]> + diff --git a/src/test/resources/xmlconf/eduni/misc/006.xml b/src/test/resources/xmlconf/eduni/misc/006.xml new file mode 100644 index 00000000..5234f760 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/006.xml @@ -0,0 +1,2 @@ + ]> + diff --git a/src/test/resources/xmlconf/eduni/misc/007.xml b/src/test/resources/xmlconf/eduni/misc/007.xml new file mode 100644 index 00000000..2da5d51b --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/007.xml @@ -0,0 +1 @@ + diff --git a/src/test/resources/xmlconf/eduni/misc/008.xml b/src/test/resources/xmlconf/eduni/misc/008.xml new file mode 100644 index 0000000000000000000000000000000000000000..ef5f345f759d0efa78d93c825403d4835ad5c969 GIT binary patch literal 86 zcmW;EI|_h66hqOwlC-`GluLi3?PRR4p5=g8pIBp_42QwQ+M@CIS ZrF0UbS3gLN>DP;sk(@@0fY(=|#DDSe4^998 literal 0 HcmV?d00001 diff --git a/src/test/resources/xmlconf/eduni/misc/009.xml b/src/test/resources/xmlconf/eduni/misc/009.xml new file mode 100644 index 00000000..8c786226 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/009.xml @@ -0,0 +1 @@ +þÿ diff --git a/src/test/resources/xmlconf/eduni/misc/ht-bh.xml b/src/test/resources/xmlconf/eduni/misc/ht-bh.xml new file mode 100644 index 00000000..bd238312 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/ht-bh.xml @@ -0,0 +1,37 @@ + + + +decimal charref > 10FFFF, indeed > max 32 bit integer, checking for recovery +from possible overflow + + +hex charref > 10FFFF, indeed > max 32 bit integer, checking for recovery +from possible overflow + + +decimal charref > 10FFFF, indeed > max 64 bit integer, checking for recovery +from possible overflow + + +hex charref > 10FFFF, indeed > max 64 bit integer, checking for recovery +from possible overflow + + +xmlns:xml is an 
attribute as far as validation is concerned and must +be declared + + +xmlns:foo is an attribute as far as validation is concerned and must +be declared + + +UTF-8 BOM plus xml decl of iso-8859-1 incompatible + + +UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible + + +UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible + + + diff --git a/src/test/resources/xmlconf/eduni/misc/xmlconf.xml b/src/test/resources/xmlconf/eduni/misc/xmlconf.xml new file mode 100644 index 00000000..f42f5dc3 --- /dev/null +++ b/src/test/resources/xmlconf/eduni/misc/xmlconf.xml @@ -0,0 +1,19 @@ + + + + +] > + + + &eduni-misc; + + +