From 5ff5d587119dcdac88e40e191b1996ee7a97d581 Mon Sep 17 00:00:00 2001 From: Gabriel Belingueres Date: Sat, 2 Apr 2022 12:31:39 -0300 Subject: [PATCH] Fixed regressions: * #163 - new case: Don't assume UTF8 as default, to allow parsing from String. * #194 - Incorrect getText() after parsing the DOCDECL section. --- .../plexus/util/xml/pull/MXParser.java | 98 +++++-- .../plexus/util/xml/pull/MXParserTest.java | 263 ++++++++++++++++++ .../resources/xml/test-entities-in-attr.xml | 9 + src/test/resources/xml/test-entities.xml | 6 + 4 files changed, 352 insertions(+), 24 deletions(-) create mode 100644 src/test/resources/xml/test-entities-in-attr.xml create mode 100644 src/test/resources/xml/test-entities.xml diff --git a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java index 3874f572..a7a034d2 100644 --- a/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java +++ b/src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java @@ -124,7 +124,7 @@ private String newStringIntern( char[] cbuf, int off, int len ) // private String elValue[]; private int elNamespaceCount[]; - private String fileEncoding = "UTF8"; + private String fileEncoding = null; /** * Make sure that we have enough space to keep element stack if passed size. It will always create one additional @@ -587,8 +587,8 @@ else if ( FEATURE_XML_ROUNDTRIP.equals( name ) ) } } - /** - * Unknown properties are always returned as false + /** + * Unknown properties are always returned as false */ @Override public boolean getFeature( String name ) @@ -2677,7 +2677,15 @@ else if ( ch == '\t' || ch == '\n' || ch == '\r' ) private char[] charRefOneCharBuf = new char[1]; - private char[] parseEntityRef() + /** + * parse Entity Ref, either a character entity or one of the predefined name entities. + * + * @return -1 if found a valid character reference, or one of the predefined character reference names + * (charRefOneCharBuf contains the replaced char). Returns the length of the found entity name, otherwise. + * @throws XmlPullParserException if invalid XML is detected. + * @throws IOException if an I/O error is found. + */ + private int parseCharOrPredefinedEntityRef() throws XmlPullParserException, IOException { // entity reference http://www.w3.org/TR/2000/REC-xml-20001006#NT-Reference @@ -2777,12 +2785,12 @@ else if ( ch >= 'A' && ch <= 'F' ) { text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length ); } - return charRefOneCharBuf; + return -1; } else { // [68] EntityRef ::= '&' Name ';' - // scan anem until ; + // scan name until ; if ( !isNameStartChar( ch ) ) { throw new XmlPullParserException( "entity reference names can not start with character '" @@ -2811,7 +2819,7 @@ else if ( ch >= 'A' && ch <= 'F' ) text = "<"; } charRefOneCharBuf[0] = '<'; - return charRefOneCharBuf; + return -1; // if(paramPC || isParserTokenizing) { // if(pcEnd >= pc.length) ensurePC(); // pc[pcEnd++] = '<'; @@ -2824,7 +2832,7 @@ else if ( len == 3 && buf[posStart] == 'a' && buf[posStart + 1] == 'm' && buf[po text = "&"; } charRefOneCharBuf[0] = '&'; - return charRefOneCharBuf; + return -1; } else if ( len == 2 && buf[posStart] == 'g' && buf[posStart + 1] == 't' ) { @@ -2833,7 +2841,7 @@ else if ( len == 2 && buf[posStart] == 'g' && buf[posStart + 1] == 't' ) text = ">"; } charRefOneCharBuf[0] = '>'; - return charRefOneCharBuf; + return -1; } else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[posStart + 2] == 'o' && buf[posStart + 3] == 's' ) @@ -2843,7 +2851,7 @@ else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[po text = "'"; } charRefOneCharBuf[0] = '\''; - return charRefOneCharBuf; + return -1; } else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[posStart + 2] == 'o' && buf[posStart + 3] == 't' ) @@ -2853,20 +2861,51 @@ else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[po text = "\""; } charRefOneCharBuf[0] = '"'; - return charRefOneCharBuf; - } - else - { - final char[] result = lookuEntityReplacement( len ); - if ( result != null ) - { - return result; - } + return -1; } - if ( tokenize ) - text = null; - return null; + return len; // name not found + } + } + + /** + * Parse an entity reference inside the DOCDECL section. + * + * @throws XmlPullParserException if invalid XML is detected. + * @throws IOException if an I/O error is found. + */ + private void parseEntityRefInDocDecl() + throws XmlPullParserException, IOException + { + final int len = parseCharOrPredefinedEntityRef(); + if ( len < 0 ) + return; + if ( tokenize ) + text = null; + } + + /** + * Parse an entity reference inside a tag or attribute. + * + * @return the char array with the replaced character entity, the replaced custom entity, or null if no replacement + * could be found. + * @throws XmlPullParserException + * @throws IOException + */ + private char[] parseEntityRef() + throws XmlPullParserException, IOException + { + final int len = parseCharOrPredefinedEntityRef(); + if ( len < 0 ) + return charRefOneCharBuf; + + final char[] result = lookuEntityReplacement( len ); + if ( result != null ) + { + return result; } + if ( tokenize ) + text = null; + return null; } /** @@ -2977,7 +3016,7 @@ else if (isValidCodePoint( ch )) } else { - throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString(((int) ch)) + " found in comment", this, null ); + throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString((ch)) + " found in comment", this, null ); } if ( normalizeIgnorableWS ) { @@ -3484,7 +3523,7 @@ else if ( ch == '>' && bracketLevel == 0 ) break; else if ( ch == '&' ) { - extractEntityRef(); + extractEntityRefInDocDecl(); } if ( normalizeIgnorableWS ) { @@ -3538,6 +3577,17 @@ else if ( ch == '\n' ) posEnd = pos - 1; } + private void extractEntityRefInDocDecl() + throws XmlPullParserException, IOException + { + // extractEntityRef + posEnd = pos - 1; + + int prevPosStart = posStart; + parseEntityRefInDocDecl(); + posStart = prevPosStart; + } + private void extractEntityRef() throws XmlPullParserException, IOException { diff --git a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java index e0be666a..f24dfe86 100644 --- a/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java +++ b/src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java @@ -17,6 +17,7 @@ */ import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -29,6 +30,7 @@ import java.nio.file.Files; import java.nio.file.Paths; +import org.codehaus.plexus.util.IOUtil; import org.codehaus.plexus.util.ReaderFactory; import org.junit.Test; @@ -898,4 +900,265 @@ public void testEncodingISO_8859_1_setInputStream() } } + /** + * Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163 + * + * Another case of bug #163: File encoding information is lost after the input file is copied to a String. + * + * @throws IOException if IO error. + * + * @since 3.4.2 + */ + @Test + public void testEncodingISO_8859_1setStringReader() + throws IOException + { + try ( Reader reader = + ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) ) + { + MXParser parser = new MXParser(); + String xmlFileContents = IOUtil.toString( reader ); + parser.setInput( new StringReader( xmlFileContents ) ); + while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) + ; + assertTrue( true ); + } + catch ( XmlPullParserException e ) + { + fail( "should not raise exception: " + e ); + } + } + + /** + *

+ * Test custom Entity not found. + *

+ * + * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0. + * + * @throws java.lang.Exception if any. + * + * @since 3.4.2 + */ + @Test + public void testCustomEntityNotFoundInText() + throws Exception + { + MXParser parser = new MXParser(); + + String input = "&otherentity;"; + parser.setInput( new StringReader( input ) ); + parser.defineEntityReplacementText( "myentity", "replacement" ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.next() ); + assertEquals( XmlPullParser.TEXT, parser.next() ); + fail( "should raise exception" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "could not resolve entity named 'otherentity' (position: START_TAG seen &otherentity;... @1:19)" ) ); + assertEquals( XmlPullParser.START_TAG, parser.getEventType() ); // not an ENTITY_REF + assertEquals( "otherentity", parser.getText() ); + } + } + + /** + *

+ * Test custom Entity not found, with tokenize. + *

+ * + * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0. + * + * @throws java.lang.Exception if any. + * + * @since 3.4.2 + */ + @Test + public void testCustomEntityNotFoundInTextTokenize() + throws Exception + { + MXParser parser = new MXParser(); + + String input = "&otherentity;"; + parser.setInput( new StringReader( input ) ); + parser.defineEntityReplacementText( "myentity", "replacement" ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertNull( parser.getText() ); + } + catch ( XmlPullParserException e ) + { + fail( "should not throw exception if tokenize" ); + } + } + + /** + *

+ * Test custom Entity not found in attribute. + *

+ * + * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0. + * + * @throws java.lang.Exception if any. + * + * @since 3.4.2 + */ + @Test + public void testCustomEntityNotFoundInAttr() + throws Exception + { + MXParser parser = new MXParser(); + + String input = "sometext"; + parser.setInput( new StringReader( input ) ); + parser.defineEntityReplacementText( "myentity", "replacement" ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.next() ); + fail( "should raise exception" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "could not resolve entity named 'otherentity' (position: START_DOCUMENT seen + * Test custom Entity not found in attribute, with tokenize. + *

+ * + * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0. + * + * @throws java.lang.Exception if any. + * + * @since 3.4.2 + */ + @Test + public void testCustomEntityNotFoundInAttrTokenize() + throws Exception + { + MXParser parser = new MXParser(); + + String input = "sometext"; + parser.setInput( new StringReader( input ) ); + parser.defineEntityReplacementText( "myentity", "replacement" ); + + try + { + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + fail( "should raise exception" ); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "could not resolve entity named 'otherentity' (position: START_DOCUMENT seen Issue #194: Incorrect getText() after parsing the DOCDECL section + * + *

test DOCDECL text with myCustomEntity that cannot be resolved.

+ * + * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0. + * + * @throws java.lang.Exception if any. + * + * @since 3.4.2 + */ + @Test + public void testDocdeclTextWithEntities() + throws IOException + { + try ( Reader reader = + ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-entities.xml" ) ) ) + { + MXParser parser = new MXParser(); + parser.setInput( reader ); + assertEquals( XmlPullParser.PROCESSING_INSTRUCTION, parser.nextToken() ); + assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() ); + assertEquals( XmlPullParser.DOCDECL, parser.nextToken() ); + assertEquals( " document [\n" + + "\n" + + "\n" + + "]", parser.getText() ); + assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() ); + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( "document", parser.getName() ); + assertEquals( XmlPullParser.TEXT, parser.next() ); + + fail( "should fail to resolve 'myCustomEntity' entity"); + } + catch ( XmlPullParserException e ) + { + assertTrue( e.getMessage().contains( "could not resolve entity named 'myCustomEntity'" )); + } + } + + /** + *

Issue #194: Incorrect getText() after parsing the DOCDECL section + * + *

test DOCDECL text with entities appearing in attributes.

+ * + * Regression test: assure same behavior of MXParser from plexus-utils 3.3.0. + * + * @throws java.lang.Exception if any. + * + * @since 3.4.2 + */ + @Test + public void DocdeclTextWithEntitiesInAttributes() + throws IOException + { + try ( Reader reader = + ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-entities-in-attr.xml" ) ) ) + { + MXParser parser = new MXParser(); + parser.setInput( reader ); + parser.defineEntityReplacementText( "nbsp", " " ); + parser.defineEntityReplacementText( "Alpha", "Α" ); + parser.defineEntityReplacementText( "tritPos", "𝟭" ); + parser.defineEntityReplacementText( "flo", "ř" ); + parser.defineEntityReplacementText( "myCustomEntity", "&flo;" ); + assertEquals( XmlPullParser.PROCESSING_INSTRUCTION, parser.nextToken() ); + assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() ); + assertEquals( XmlPullParser.DOCDECL, parser.nextToken() ); + assertEquals( " document [\n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "]", parser.getText() ); + assertEquals( XmlPullParser.IGNORABLE_WHITESPACE, parser.nextToken() ); + assertEquals( XmlPullParser.START_TAG, parser.nextToken() ); + assertEquals( "document", parser.getName() ); + assertEquals( 1, parser.getAttributeCount() ); + assertEquals( "name", parser.getAttributeName( 0 ) ); + assertEquals( "section name with entities: '&' 'Α' '<' ' ' '>' '𝟭' ''' 'ř' '\"'", + parser.getAttributeValue( 0 ) ); + + assertEquals( XmlPullParser.ENTITY_REF, parser.nextToken() ); + assertEquals( "myCustomEntity", parser.getName() ); + assertEquals( "ř", parser.getText() ); + + assertEquals( XmlPullParser.END_TAG, parser.nextToken() ); + assertEquals( XmlPullParser.END_DOCUMENT, parser.nextToken() ); + } + catch ( XmlPullParserException e ) + { + fail( "should not raise exception: " + e ); + } + } + } diff --git a/src/test/resources/xml/test-entities-in-attr.xml b/src/test/resources/xml/test-entities-in-attr.xml new file mode 100644 index 00000000..a423c995 --- /dev/null +++ b/src/test/resources/xml/test-entities-in-attr.xml @@ -0,0 +1,9 @@ + + + + + + +]> +&myCustomEntity; \ No newline at end of file diff --git a/src/test/resources/xml/test-entities.xml b/src/test/resources/xml/test-entities.xml new file mode 100644 index 00000000..e1d6d17a --- /dev/null +++ b/src/test/resources/xml/test-entities.xml @@ -0,0 +1,6 @@ + + + +]> +&myCustomEntity; \ No newline at end of file