From 5ff5d587119dcdac88e40e191b1996ee7a97d581 Mon Sep 17 00:00:00 2001
From: Gabriel Belingueres
Date: Sat, 2 Apr 2022 12:31:39 -0300
Subject: [PATCH] Fixed regressions:
* #163 - new case: Don't assume UTF8 as default, to allow parsing from String.
* #194 - Incorrect getText() after parsing the DOCDECL section.
---
.../plexus/util/xml/pull/MXParser.java | 98 +++++--
.../plexus/util/xml/pull/MXParserTest.java | 263 ++++++++++++++++++
.../resources/xml/test-entities-in-attr.xml | 9 +
src/test/resources/xml/test-entities.xml | 6 +
4 files changed, 352 insertions(+), 24 deletions(-)
create mode 100644 src/test/resources/xml/test-entities-in-attr.xml
create mode 100644 src/test/resources/xml/test-entities.xml
diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
index 3874f572..a7a034d2 100644
--- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
+++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
@@ -124,7 +124,7 @@ private String newStringIntern( char[] cbuf, int off, int len )
// private String elValue[];
private int elNamespaceCount[];
- private String fileEncoding = "UTF8";
+ private String fileEncoding = null;
/**
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
@@ -587,8 +587,8 @@ else if ( FEATURE_XML_ROUNDTRIP.equals( name ) )
}
}
- /**
- * Unknown properties are always returned as false
+ /**
+ * Unknown properties are always returned as false
*/
@Override
public boolean getFeature( String name )
@@ -2677,7 +2677,15 @@ else if ( ch == '\t' || ch == '\n' || ch == '\r' )
private char[] charRefOneCharBuf = new char[1];
- private char[] parseEntityRef()
+ /**
+ * Parses an entity reference: either a character reference or one of the predefined named entities.
+ *
+ * @return -1 if a valid character reference or a predefined entity name was found (charRefOneCharBuf then
+ * contains the replacement char); otherwise, the length of the entity name that was found.
+ * @throws XmlPullParserException if invalid XML is detected.
+ * @throws IOException if an I/O error is found.
+ */
+ private int parseCharOrPredefinedEntityRef()
throws XmlPullParserException, IOException
{
// entity reference http://www.w3.org/TR/2000/REC-xml-20001006#NT-Reference
@@ -2777,12 +2785,12 @@ else if ( ch >= 'A' && ch <= 'F' )
{
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
}
- return charRefOneCharBuf;
+ return -1;
}
else
{
// [68] EntityRef ::= '&' Name ';'
- // scan anem until ;
+ // scan name until ;
if ( !isNameStartChar( ch ) )
{
throw new XmlPullParserException( "entity reference names can not start with character '"
@@ -2811,7 +2819,7 @@ else if ( ch >= 'A' && ch <= 'F' )
text = "<";
}
charRefOneCharBuf[0] = '<';
- return charRefOneCharBuf;
+ return -1;
// if(paramPC || isParserTokenizing) {
// if(pcEnd >= pc.length) ensurePC();
// pc[pcEnd++] = '<';
@@ -2824,7 +2832,7 @@ else if ( len == 3 && buf[posStart] == 'a' && buf[posStart + 1] == 'm' && buf[po
text = "&";
}
charRefOneCharBuf[0] = '&';
- return charRefOneCharBuf;
+ return -1;
}
else if ( len == 2 && buf[posStart] == 'g' && buf[posStart + 1] == 't' )
{
@@ -2833,7 +2841,7 @@ else if ( len == 2 && buf[posStart] == 'g' && buf[posStart + 1] == 't' )
text = ">";
}
charRefOneCharBuf[0] = '>';
- return charRefOneCharBuf;
+ return -1;
}
else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[posStart + 2] == 'o'
&& buf[posStart + 3] == 's' )
@@ -2843,7 +2851,7 @@ else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[po
text = "'";
}
charRefOneCharBuf[0] = '\'';
- return charRefOneCharBuf;
+ return -1;
}
else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[posStart + 2] == 'o'
&& buf[posStart + 3] == 't' )
@@ -2853,20 +2861,51 @@ else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[po
text = "\"";
}
charRefOneCharBuf[0] = '"';
- return charRefOneCharBuf;
- }
- else
- {
- final char[] result = lookuEntityReplacement( len );
- if ( result != null )
- {
- return result;
- }
+ return -1;
}
- if ( tokenize )
- text = null;
- return null;
+ return len; // name not found
+ }
+ }
+
+ /**
+ * Parse an entity reference inside the DOCDECL section; no custom entity replacement is looked up here.
+ *
+ * @throws XmlPullParserException if invalid XML is detected.
+ * @throws IOException if an I/O error is found.
+ */
+ private void parseEntityRefInDocDecl()
+ throws XmlPullParserException, IOException
+ {
+ final int len = parseCharOrPredefinedEntityRef();
+ if ( len < 0 )
+ return; // negative: a character reference or predefined entity was recognized
+ if ( tokenize )
+ text = null; // any other entity name is left unresolved, so no text is reported when tokenizing
+ }
+
+ /**
+ * Parse an entity reference inside a tag or attribute.
+ *
+ * @return the char array holding the replacement of a character reference or predefined entity, the replacement
+ * of a custom entity, or null if no replacement could be found.
+ * @throws XmlPullParserException if invalid XML is detected.
+ * @throws IOException if an I/O error is found.
+ */
+ private char[] parseEntityRef()
+ throws XmlPullParserException, IOException
+ {
+ final int len = parseCharOrPredefinedEntityRef();
+ if ( len < 0 )
+ return charRefOneCharBuf; // character reference or predefined entity: replacement char is in the buffer
+
+ final char[] result = lookuEntityReplacement( len ); // len is the length of the entity name just scanned
+ if ( result != null )
+ {
+ return result;
}
+ if ( tokenize )
+ text = null; // unresolved entity: no text is reported when tokenizing
+ return null;
}
/**
@@ -2977,7 +3016,7 @@ else if (isValidCodePoint( ch ))
}
else
{
- throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString(((int) ch)) + " found in comment", this, null );
+ throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString((ch)) + " found in comment", this, null );
}
if ( normalizeIgnorableWS )
{
@@ -3484,7 +3523,7 @@ else if ( ch == '>' && bracketLevel == 0 )
break;
else if ( ch == '&' )
{
- extractEntityRef();
+ extractEntityRefInDocDecl();
}
if ( normalizeIgnorableWS )
{
@@ -3538,6 +3577,17 @@ else if ( ch == '\n' )
posEnd = pos - 1;
}
+ private void extractEntityRefInDocDecl()
+ throws XmlPullParserException, IOException
+ {
+ // DOCDECL counterpart of extractEntityRef: parses the reference while keeping posStart unchanged
+ posEnd = pos - 1;
+
+ int prevPosStart = posStart; // saved so it can be restored once the reference has been parsed
+ parseEntityRefInDocDecl();
+ posStart = prevPosStart; // restore the start position that was current before the reference
+ }
+
private void extractEntityRef()
throws XmlPullParserException, IOException
{
diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
index e0be666a..f24dfe86 100644
--- a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
+++ b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
@@ -17,6 +17,7 @@
*/
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -29,6 +30,7 @@
import java.nio.file.Files;
import java.nio.file.Paths;
+import org.codehaus.plexus.util.IOUtil;
import org.codehaus.plexus.util.ReaderFactory;
import org.junit.Test;
@@ -898,4 +900,265 @@ public void testEncodingISO_8859_1_setInputStream()
}
}
+ /**
+ * Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
+ *
+ * Another case of bug #163: the file encoding information is lost after the input file is copied to a String.
+ *
+ * @throws IOException if an I/O error occurs.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void testEncodingISO_8859_1setStringReader()
+ throws IOException
+ {
+ try ( Reader reader =
+ ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
+ {
+ MXParser parser = new MXParser();
+ String xmlFileContents = IOUtil.toString( reader );
+ parser.setInput( new StringReader( xmlFileContents ) ); // parse from the String copy, not from the file
+ while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
+ ; // intentionally empty: just consume every token up to END_DOCUMENT
+ assertTrue( true ); // reaching this point without an exception is the success criterion
+ }
+ catch ( XmlPullParserException e )
+ {
+ fail( "should not raise exception: " + e );
+ }
+ }
+
+ /**
+ * <p>
+ * Test custom Entity not found.
+ * </p>
+ *
+ * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0.
+ *
+ * @throws java.lang.Exception if any.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void testCustomEntityNotFoundInText()
+ throws Exception
+ {
+ MXParser parser = new MXParser();
+
+ String input = "<root>&otherentity;</root>"; // the reference must sit inside an element: START_TAG comes first
+ parser.setInput( new StringReader( input ) );
+ parser.defineEntityReplacementText( "myentity", "replacement" ); // a different entity than the one referenced
+
+ try
+ {
+ assertEquals( XmlPullParser.START_TAG, parser.next() );
+ assertEquals( XmlPullParser.TEXT, parser.next() );
+ fail( "should raise exception" );
+ }
+ catch ( XmlPullParserException e )
+ {
+ assertTrue( e.getMessage().contains( "could not resolve entity named 'otherentity' (position: START_TAG seen <root>&otherentity;... @1:19)" ) );
+ assertEquals( XmlPullParser.START_TAG, parser.getEventType() ); // not an ENTITY_REF
+ assertEquals( "otherentity", parser.getText() );
+ }
+ }
+
+ /**
+ *
+ * Test custom Entity not found, with tokenize.
+ *
+ *
+ * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0.
+ *
+ * @throws java.lang.Exception if any.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void testCustomEntityNotFoundInTextTokenize()
+ throws Exception
+ {
+ MXParser parser = new MXParser();
+
+ String input = "&otherentity;";
+ parser.setInput( new StringReader( input ) );
+ parser.defineEntityReplacementText( "myentity", "replacement" );
+
+ try
+ {
+ assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
+ assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
+ assertNull( parser.getText() );
+ }
+ catch ( XmlPullParserException e )
+ {
+ fail( "should not throw exception if tokenize" );
+ }
+ }
+
+ /**
+ *
+ * Test custom Entity not found in attribute.
+ *
+ *
+ * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0.
+ *
+ * @throws java.lang.Exception if any.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void testCustomEntityNotFoundInAttr()
+ throws Exception
+ {
+ MXParser parser = new MXParser();
+
+ String input = "sometext";
+ parser.setInput( new StringReader( input ) );
+ parser.defineEntityReplacementText( "myentity", "replacement" );
+
+ try
+ {
+ assertEquals( XmlPullParser.START_TAG, parser.next() );
+ fail( "should raise exception" );
+ }
+ catch ( XmlPullParserException e )
+ {
+ assertTrue( e.getMessage().contains( "could not resolve entity named 'otherentity' (position: START_DOCUMENT seen
+ * Test custom Entity not found in attribute, with tokenize.
+ *
+ *
+ * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0.
+ *
+ * @throws java.lang.Exception if any.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void testCustomEntityNotFoundInAttrTokenize()
+ throws Exception
+ {
+ MXParser parser = new MXParser();
+
+ String input = "sometext";
+ parser.setInput( new StringReader( input ) );
+ parser.defineEntityReplacementText( "myentity", "replacement" );
+
+ try
+ {
+ assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
+ fail( "should raise exception" );
+ }
+ catch ( XmlPullParserException e )
+ {
+ assertTrue( e.getMessage().contains( "could not resolve entity named 'otherentity' (position: START_DOCUMENT seen Issue #194: Incorrect getText() after parsing the DOCDECL section>
+ *
+ * test DOCDECL text with myCustomEntity that cannot be resolved.
+ *
+ * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0.
+ *
+ * @throws java.lang.Exception if any.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void testDocdeclTextWithEntities()
+ throws IOException
+ {
+ try ( Reader reader =
+ ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-entities.xml" ) ) )
+ {
+ MXParser parser = new MXParser();
+ parser.setInput( reader );
+ assertEquals( XmlPullParser.PROCESSING_INSTRUCTION, parser.nextToken() );
+ assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() );
+ assertEquals( XmlPullParser.DOCDECL, parser.nextToken() ); // getText() is checked right after DOCDECL
+ assertEquals( " document [\n"
+ + "\n"
+ + "\n"
+ + "]", parser.getText() ); // NOTE(review): literal looks stripped of its entity declarations - verify
+ assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() );
+ assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
+ assertEquals( "document", parser.getName() );
+ assertEquals( XmlPullParser.TEXT, parser.next() ); // expected to throw: 'myCustomEntity' is not resolvable
+
+ fail( "should fail to resolve 'myCustomEntity' entity");
+ }
+ catch ( XmlPullParserException e )
+ {
+ assertTrue( e.getMessage().contains( "could not resolve entity named 'myCustomEntity'" ));
+ }
+ }
+
+ /**
+ * Issue #194: Incorrect getText() after parsing the DOCDECL section
+ *
+ *
+ * test DOCDECL text with entities appearing in attributes.
+ *
+ * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0.
+ *
+ * @throws java.lang.Exception if any.
+ *
+ * @since 3.4.2
+ */
+ @Test
+ public void DocdeclTextWithEntitiesInAttributes()
+ throws IOException
+ {
+ try ( Reader reader =
+ ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-entities-in-attr.xml" ) ) )
+ {
+ MXParser parser = new MXParser();
+ parser.setInput( reader );
+ parser.defineEntityReplacementText( "nbsp", " " );
+ parser.defineEntityReplacementText( "Alpha", "Α" );
+ parser.defineEntityReplacementText( "tritPos", "𝟭" );
+ parser.defineEntityReplacementText( "flo", "ř" );
+ parser.defineEntityReplacementText( "myCustomEntity", "&flo;" );
+ assertEquals( XmlPullParser.PROCESSING_INSTRUCTION, parser.nextToken() );
+ assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() );
+ assertEquals( XmlPullParser.DOCDECL, parser.nextToken() );
+ assertEquals( " document [\n"
+ + " \n"
+ + " \n"
+ + " \n"
+ + "\n"
+ + "\n"
+ + "]", parser.getText() );
+ assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() );
+ assertEquals( XmlPullParser.START_TAG, parser.nextToken() );
+ assertEquals( "document", parser.getName() );
+ assertEquals( 1, parser.getAttributeCount() );
+ assertEquals( "name", parser.getAttributeName( 0 ) );
+ assertEquals( "section name with entities: '&' 'Α' '<' ' ' '>' '𝟭' ''' 'ř' '\"'",
+ parser.getAttributeValue( 0 ) );
+
+ assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() );
+ assertEquals( "myCustomEntity", parser.getName() );
+ assertEquals( "ř", parser.getText() );
+
+ assertEquals( XmlPullParser.END_TAG, parser.nextToken() );
+ assertEquals( XmlPullParser.END_DOCUMENT, parser.nextToken() );
+ }
+ catch ( XmlPullParserException e )
+ {
+ fail( "should not raise exception: " + e );
+ }
+ }
+
}
diff --git a/src/test/resources/xml/test-entities-in-attr.xml b/src/test/resources/xml/test-entities-in-attr.xml
new file mode 100644
index 00000000..a423c995
--- /dev/null
+++ b/src/test/resources/xml/test-entities-in-attr.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+]>
+&myCustomEntity;
\ No newline at end of file
diff --git a/src/test/resources/xml/test-entities.xml b/src/test/resources/xml/test-entities.xml
new file mode 100644
index 00000000..e1d6d17a
--- /dev/null
+++ b/src/test/resources/xml/test-entities.xml
@@ -0,0 +1,6 @@
+
+
+
+]>
+&myCustomEntity;
\ No newline at end of file