Skip to content

Commit

Permalink
Fix MXParser improve error reporting (#136) (#137)
Browse files Browse the repository at this point in the history
- when parsing large char entities.
- when mixing invalid encoding declarations and file encodings.
  • Loading branch information
belingueres committed Apr 4, 2021
1 parent 48e444f commit 1e18ddc
Show file tree
Hide file tree
Showing 13 changed files with 399 additions and 8 deletions.
51 changes: 43 additions & 8 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Expand Up @@ -11,6 +11,7 @@

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

Expand Down Expand Up @@ -122,6 +123,8 @@ private String newStringIntern( char[] cbuf, int off, int len )
// private String elValue[];
private int elNamespaceCount[];

private String fileEncoding = "UTF8";

/**
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
* slot then current depth
Expand Down Expand Up @@ -659,6 +662,15 @@ public void setInput( Reader in )
{
reset();
reader = in;

if ( reader instanceof InputStreamReader )
{
InputStreamReader isr = (InputStreamReader) reader;
if ( isr.getEncoding() != null )
{
fileEncoding = isr.getEncoding().toUpperCase();
}
}
}

@Override
Expand Down Expand Up @@ -1771,6 +1783,17 @@ private int parseProlog()
// skipping UNICODE int Order Mark (so called BOM)
ch = more();
}
else if ( ch == '\uFFFD' )
{
// UTF-16 BOM in an UTF-8 encoded file?
// This is a hack...not the best way to check for BOM in UTF-16
ch = more();
if ( ch == '\uFFFD' )
{
throw new XmlPullParserException( "UTF-16 BOM in a UTF-8 encoded file is incompatible", this,
null );
}
}
}
seenMarkup = false;
boolean gotS = false;
Expand Down Expand Up @@ -2723,18 +2746,19 @@ else if ( ch >= 'A' && ch <= 'F' )
}
posEnd = pos - 1;

int codePoint = Integer.parseInt( sb.toString(), isHex ? 16 : 10 );
boolean isValidCodePoint = isValidCodePoint( codePoint );
if ( isValidCodePoint )
boolean isValidCodePoint = true;
try
{
try
int codePoint = Integer.parseInt( sb.toString(), isHex ? 16 : 10 );
isValidCodePoint = isValidCodePoint( codePoint );
if ( isValidCodePoint )
{
charRefOneCharBuf = Character.toChars( codePoint );
}
catch ( IllegalArgumentException e )
{
isValidCodePoint = false;
}
}
catch ( IllegalArgumentException e )
{
isValidCodePoint = false;
}

if ( !isValidCodePoint )
Expand Down Expand Up @@ -3328,6 +3352,17 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )

// TODO reconcile with setInput encodingName
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );

if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
{
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
{
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}
}

ch = more();
Expand Down
@@ -0,0 +1,278 @@
package org.codehaus.plexus.util.xml.pull;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import org.junit.Before;
import org.junit.Test;

/**
* Test class that execute a particular set of tests associated to a TESCASES tag from the XML W3C Conformance Tests.
* TESCASES PROFILE: <pre>Bjoern Hoehrmann via HST 2013-09-18</pre>
* XML test files base folder: <pre>xmlconf/eduni/misc/</pre>
*
* @author <a href="mailto:belingueres@gmail.com">Gabriel Belingueres</a>
*/
public class eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test
{

final static File testResourcesDir = new File("src/test/resources/", "xmlconf/eduni/misc/");

MXParser parser;

@Before
public void setUp()
{
parser = new MXParser();
}

/**
* Test ID: <pre>hst-bh-001</pre>
* Test URI: <pre>001.xml</pre>
* Comment: <pre>decimal charref &#38;#62; 10FFFF, indeed &#38;#62; max 32 bit integer, checking for recovery from possible overflow</pre>
* Sections: <pre>2.2 [2], 4.1 [66]</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_bh_001()
throws IOException
{
try ( Reader reader = new FileReader( new File( testResourcesDir, "001.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "decimal charref > 10FFFF, indeed > max 32 bit integer, checking for recovery from possible overflow" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "character reference (with hex value FF000000F6) is invalid" ) );
}
}

/**
* Test ID: <pre>hst-bh-002</pre>
* Test URI: <pre>002.xml</pre>
* Comment: <pre>hex charref &#38;#62; 10FFFF, indeed &#38;#62; max 32 bit integer, checking for recovery from possible overflow</pre>
* Sections: <pre>2.2 [2], 4.1 [66]</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_bh_002()
throws IOException
{
try ( Reader reader = new FileReader( new File( testResourcesDir, "002.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "hex charref > 10FFFF, indeed > max 32 bit integer, checking for recovery from possible overflow" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "character reference (with decimal value 4294967542) is invalid" ) );
}
}

/**
* Test ID: <pre>hst-bh-003</pre>
* Test URI: <pre>003.xml</pre>
* Comment: <pre>decimal charref &#38;#62; 10FFFF, indeed &#38;#62; max 64 bit integer, checking for recovery from possible overflow</pre>
* Sections: <pre>2.2 [2], 4.1 [66]</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_bh_003()
throws IOException
{
try ( Reader reader = new FileReader( new File( testResourcesDir, "003.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "decimal charref > 10FFFF, indeed > max 64 bit integer, checking for recovery from possible overflow" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "character reference (with hex value FFFFFFFF000000F6) is invalid" ) );
}
}

/**
* Test ID: <pre>hst-bh-004</pre>
* Test URI: <pre>004.xml</pre>
* Comment: <pre>hex charref &#38;#62; 10FFFF, indeed &#38;#62; max 64 bit integer, checking for recovery from possible overflow</pre>
* Sections: <pre>2.2 [2], 4.1 [66]</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_bh_004()
throws IOException
{
try ( Reader reader = new FileReader( new File( testResourcesDir, "004.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "hex charref > 10FFFF, indeed > max 64 bit integer, checking for recovery from possible overflow" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "character reference (with decimal value 18446744073709551862) is invalid" ) );
}
}

/**
* Test ID: <pre>hst-bh-005</pre>
* Test URI: <pre>005.xml</pre>
* Comment: <pre>xmlns:xml is an attribute as far as validation is concerned and must be declared</pre>
* Sections: <pre>3.1 [41]</pre>
* Version:
*
* @throws IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as MXParser do not supports DOCDECL parsing.
*/
// @Test
public void testhst_bh_005()
throws IOException
{
try ( Reader reader = new FileReader( new File( testResourcesDir, "005.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "xmlns:xml is an attribute as far as validation is concerned and must be declared" );
}
catch ( XmlPullParserException e )
{
assertTrue( true );
}
}

/**
* Test ID: <pre>hst-bh-006</pre>
* Test URI: <pre>006.xml</pre>
* Comment: <pre>xmlns:foo is an attribute as far as validation is concerned and must be declared</pre>
* Sections: <pre>3.1 [41]</pre>
* Version:
*
* @throws IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as MXParser do not supports DOCDECL parsing.
*/
// @Test
public void testhst_bh_006()
throws IOException
{
try ( Reader reader = new FileReader( new File( testResourcesDir, "006.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "xmlns:foo is an attribute as far as validation is concerned and must be declared" );
}
catch ( XmlPullParserException e )
{
assertTrue( true );
}
}

/**
* Test ID: <pre>hst-lhs-007</pre>
* Test URI: <pre>007.xml</pre>
* Comment: <pre>UTF-8 BOM plus xml decl of iso-8859-1 incompatible</pre>
* Sections: <pre>4.3.3</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_lhs_007()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) );
}
}

/**
* Test ID: <pre>hst-lhs-008</pre>
* Test URI: <pre>008.xml</pre>
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible</pre>
* Sections: <pre>4.3.3</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_lhs_008()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) );
}
}

/**
* Test ID: <pre>hst-lhs-009</pre>
* Test URI: <pre>009.xml</pre>
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible</pre>
* Sections: <pre>4.3.3</pre>
* Version:
*
* @throws IOException if there is an I/O error
*/
@Test
public void testhst_lhs_009()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
}
}

}
4 changes: 4 additions & 0 deletions src/test/resources/xmlconf/eduni/misc/001.xml
@@ -0,0 +1,4 @@
<!DOCTYPE p [
<!ELEMENT p (#PCDATA)>
]>
<p>Fa&#xFF000000F6;il</p> <!-- 32 bit integer overflow -->
4 changes: 4 additions & 0 deletions src/test/resources/xmlconf/eduni/misc/002.xml
@@ -0,0 +1,4 @@
<!DOCTYPE p [
<!ELEMENT p (#PCDATA)>
]>
<p>Fa&#4294967542;il</p> <!-- 32 bit integer overflow -->
4 changes: 4 additions & 0 deletions src/test/resources/xmlconf/eduni/misc/003.xml
@@ -0,0 +1,4 @@
<!DOCTYPE p [
<!ELEMENT p (#PCDATA)>
]>
<p>Fa&#xFFFFFFFF000000F6;il</p> <!-- 64 bit integer overflow -->
4 changes: 4 additions & 0 deletions src/test/resources/xmlconf/eduni/misc/004.xml
@@ -0,0 +1,4 @@
<!DOCTYPE p [
<!ELEMENT p (#PCDATA)>
]>
<p>Fa&#18446744073709551862;il</p> <!-- 64 bit integer overflow -->
2 changes: 2 additions & 0 deletions src/test/resources/xmlconf/eduni/misc/005.xml
@@ -0,0 +1,2 @@
<!DOCTYPE x [ <!ELEMENT x EMPTY> ]>
<x xmlns:xml='http://www.w3.org/XML/1998/namespace'/>

6 comments on commit 1e18ddc

@michael-o
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just hit a regression with this change. Maven Doxia Sitetools from fail with 3.4.0, it says:

[ERROR] Tests run: 7, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: 2.234 s <<< FAILURE! - in org.apache.maven.doxia.siterenderer.DefaultSiteRendererTest
[ERROR] org.apache.maven.doxia.siterenderer.DefaultSiteRendererTest.testRender  Time elapsed: 0.106 s  <<< ERROR!
org.codehaus.plexus.util.xml.pull.XmlPullParserException: UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible (position: START_DOCUMENT seen <?xml version="1.0" encoding="ISO-8859-1"... @1:41)
        at org.apache.maven.doxia.siterenderer.DefaultSiteRendererTest.testRender(DefaultSiteRendererTest.java:253)

@belingueres Can you check?

@belingueres
Copy link
Contributor Author

@belingueres belingueres commented on 1e18ddc Feb 15, 2022 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@michael-o
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cannot confirm:

[INFO] Running org.apache.maven.doxia.tools.SiteToolTest
[ERROR] Tests run: 10, Failures: 0, Errors: 2, Skipped: 0, Time elapsed: 3.388 s <<< FAILURE! - in org.apache.maven.doxia.tools.SiteToolTest
[ERROR] testGetDecorationModel(org.apache.maven.doxia.tools.SiteToolTest)  Time elapsed: 0.074 s  <<< ERROR!
org.apache.maven.doxia.tools.SiteToolException: Error parsing site descriptor
        at org.apache.maven.doxia.tools.SiteToolTest.testGetDecorationModel(SiteToolTest.java:257)
Caused by: org.codehaus.plexus.util.xml.pull.XmlPullParserException: UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible (position: START_DOCUMENT seen <?xml version="1.0" encoding="ISO-8859-1"... @1:41)
        at org.apache.maven.doxia.tools.SiteToolTest.testGetDecorationModel(SiteToolTest.java:257)

[ERROR] testDecorationModelInheritanceAndInterpolation(org.apache.maven.doxia.tools.SiteToolTest)  Time elapsed: 0.064 s  <<< ERROR!
org.apache.maven.doxia.tools.SiteToolException: Error parsing site descriptor
        at org.apache.maven.doxia.tools.SiteToolTest.testDecorationModelInheritanceAndInterpolation(SiteToolTest.java:368)
Caused by: org.codehaus.plexus.util.xml.pull.XmlPullParserException: UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible (position: START_DOCUMENT seen <?xml version="1.0" encoding="ISO-8859-1"... @1:41)
        at org.apache.maven.doxia.tools.SiteToolTest.testDecorationModelInheritanceAndInterpolation(SiteToolTest.java:368)

[INFO]
[INFO] Results:
[INFO]
[ERROR] Errors:
[ERROR]   SiteToolTest.testDecorationModelInheritanceAndInterpolation:368 � SiteTool Err...
[ERROR]   SiteToolTest.testGetDecorationModel:257 � SiteTool Error parsing site descript...
[INFO]
[ERROR] Tests run: 15, Failures: 0, Errors: 2, Skipped: 0
[INFO]
[INFO] ------------------------------------------------------------------------
[INFO] Reactor Summary for Doxia Sitetools 2.0.0-M2-SNAPSHOT:
[INFO]
[INFO] Doxia Sitetools .................................... SUCCESS [  3.038 s]
[INFO] Doxia Sitetools :: Decoration Model ................ SUCCESS [  4.193 s]
[INFO] Doxia Sitetools :: Skin Model ...................... SUCCESS [  0.464 s]
[INFO] Doxia Sitetools :: Integration Tools ............... FAILURE [  6.529 s]
[INFO] Doxia Sitetools :: Site Renderer ................... SKIPPED
[INFO] ------------------------------------------------------------------------
[INFO] BUILD FAILURE
[INFO] ------------------------------------------------------------------------
[INFO] Total time:  14.382 s
[INFO] Finished at: 2022-02-16T20:38:09+01:00
[INFO] ------------------------------------------------------------------------
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-surefire-plugin:2.22.0:test (default-test) on project doxia-integration-tools: There are test failures.
[ERROR]
[ERROR] Please refer to D:\Entwicklung\Projekte\maven-doxia-sitetools\doxia-integration-tools\target\surefire-reports for the individual test results.
[ERROR] Please refer to dump files (if any exist) [date]-jvmRun[N].dump, [date].dumpstream and [date]-jvmRun[N].dumpstream.
[ERROR] -> [Help 1]
[ERROR]
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch.
[ERROR] Re-run Maven using the -X switch to enable full debug logging.
[ERROR]
[ERROR] For more information about the errors and possible solutions, please read the following articles:
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException
[ERROR]
[ERROR] After correcting the problems, you can resume the build with the command
[ERROR]   mvn <args> -rf :doxia-integration-tools
D:\Entwicklung\Projekte\maven-doxia-sitetools [master ≡ +3 ~1 -0 !]> git diff
diff --git a/pom.xml b/pom.xml
index 2681b97..ff0e23a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -174,7 +174,7 @@ under the License.
       <dependency>
         <groupId>org.codehaus.plexus</groupId>
         <artifactId>plexus-utils</artifactId>
-        <version>3.3.0</version>
+        <version>3.4.1</version>
       </dependency>
       <!--  SLF4J -->

@belingueres
Copy link
Contributor Author

@belingueres belingueres commented on 1e18ddc Feb 17, 2022 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@michael-o
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No actions required, I need to understand why cause and figure out what we have done wrong. If you have an idea how to fix this inside Sitetools, I'd be grateful.

@michael-o
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue has been addressed in 3.4.2.

Please sign in to comment.