Skip to content

Commit

Permalink
Fix parsing a UTF-8 file without BOM and ISO-8859-1 encoding (codeha…
Browse files Browse the repository at this point in the history
…us-plexus#242)

* Deleted most code handling encoding (leaving that job to the XmlReader)
* Fixed tests exercising encoding checks. Unsupported tests were skipped
* Simplified test-encoding-ISO-8859-1.xml test file

Skipped even more tests that pass on Linux but fail on Windows.
  • Loading branch information
belingueres committed Mar 13, 2023
1 parent 6714fe0 commit 831f645
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 1,554 deletions.
30 changes: 1 addition & 29 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Expand Up @@ -11,12 +11,10 @@

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import org.codehaus.plexus.util.ReaderFactory;
import org.codehaus.plexus.util.xml.XmlReader;

//import java.util.Hashtable;

Expand Down Expand Up @@ -124,7 +122,6 @@ private String newStringIntern( char[] cbuf, int off, int len )
// private String elValue[];
private int elNamespaceCount[];

private String fileEncoding = null;

/**
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
Expand Down Expand Up @@ -663,20 +660,6 @@ public void setInput( Reader in )
{
reset();
reader = in;

if ( reader instanceof XmlReader ) {
// encoding already detected
XmlReader xsr = (XmlReader) reader;
fileEncoding = xsr.getEncoding();
}
else if ( reader instanceof InputStreamReader )
{
InputStreamReader isr = (InputStreamReader) reader;
if ( isr.getEncoding() != null )
{
fileEncoding = isr.getEncoding().toUpperCase();
}
}
}

@Override
Expand Down Expand Up @@ -3432,18 +3415,7 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
final int encodingEnd = pos - 1;

// TODO reconcile with setInput encodingName
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );

if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
{
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
{
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}
// inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );

lastParsedAttr = "encoding";

Expand Down
100 changes: 91 additions & 9 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Expand Up @@ -27,6 +27,7 @@
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

Expand Down Expand Up @@ -968,7 +969,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace()
* @since 3.4.1
*/
@Test
public void testEncodingISO_8859_1setInputReader()
public void testEncodingISO_8859_1_newXmlReader()
throws IOException
{
try ( Reader reader =
Expand All @@ -994,7 +995,7 @@ public void testEncodingISO_8859_1setInputReader()
* @since 3.4.1
*/
@Test
public void testEncodingISO_8859_1_setInputStream()
public void testEncodingISO_8859_1_InputStream()
throws IOException
{
try ( InputStream input =
Expand All @@ -1012,12 +1013,6 @@ public void testEncodingISO_8859_1_setInputStream()
}
}

private static void assertPosition( int row, int col, MXParser parser )
{
assertEquals( "Current line", row, parser.getLineNumber() );
assertEquals( "Current column", col, parser.getColumnNumber() );
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
Expand All @@ -1028,7 +1023,7 @@ private static void assertPosition( int row, int col, MXParser parser )
* @since 3.4.2
*/
@Test
public void testEncodingISO_8859_1setStringReader()
public void testEncodingISO_8859_1_StringReader()
throws IOException
{
try ( Reader reader =
Expand All @@ -1047,6 +1042,93 @@ public void testEncodingISO_8859_1setStringReader()
}
}

/**
 * Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
 *
 * Variant of bug #163 where the reader is obtained from ReaderFactory.newReader with UTF-8 passed
 * explicitly as the file encoding. The whole document must tokenize without an XmlPullParserException.
 *
 * @throws IOException if IO error.
 *
 * @since 3.5.2
 */
@Test
public void testEncodingISO_8859_1_newReader()
    throws IOException
{
    final File xmlFile = new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" );
    final String charsetName = StandardCharsets.UTF_8.name();

    try ( Reader xmlReader = ReaderFactory.newReader( xmlFile, charsetName ) )
    {
        MXParser mxParser = new MXParser();
        mxParser.setInput( xmlReader );

        // Consume every token; reaching END_DOCUMENT means the parse went through.
        int token;
        do
        {
            token = mxParser.nextToken();
        }
        while ( token != XmlPullParser.END_DOCUMENT );

        assertTrue( true );
    }
    catch ( XmlPullParserException e )
    {
        fail( "should not raise exception: " + e );
    }
}

/**
 * Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
 *
 * Variant of bug #163 where the parser is fed a raw InputStream together with UTF-8 as the encoding
 * name. The whole document must tokenize without an XmlPullParserException.
 *
 * @throws IOException if IO error.
 *
 * @since 3.5.2
 */
@Test
public void testEncodingISO_8859_1_InputStream_encoded()
    throws IOException
{
    try ( InputStream xmlStream =
        Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
    {
        MXParser mxParser = new MXParser();
        final String charsetName = StandardCharsets.UTF_8.name();
        mxParser.setInput( xmlStream, charsetName );

        // Consume every token; reaching END_DOCUMENT means the parse went through.
        int token;
        do
        {
            token = mxParser.nextToken();
        }
        while ( token != XmlPullParser.END_DOCUMENT );

        assertTrue( true );
    }
    catch ( XmlPullParserException e )
    {
        fail( "should not raise exception: " + e );
    }
}

/**
 * Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
 *
 * Parses test-encoding-ISO-8859-1.xml through a reader obtained from ReaderFactory.newXmlReader and
 * expects the whole document to tokenize without an XmlPullParserException.
 *
 * @throws IOException if IO error.
 *
 * @since 3.4.1
 */
@Test
public void testEncodingUTF8_newXmlReader()
    throws IOException
{
    final File xmlFile = new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" );

    try ( Reader xmlReader = ReaderFactory.newXmlReader( xmlFile ) )
    {
        MXParser mxParser = new MXParser();
        mxParser.setInput( xmlReader );

        // Consume every token; reaching END_DOCUMENT means the parse went through.
        int token;
        do
        {
            token = mxParser.nextToken();
        }
        while ( token != XmlPullParser.END_DOCUMENT );

        assertTrue( true );
    }
    catch ( XmlPullParserException e )
    {
        fail( "should not raise exception: " + e );
    }
}

/**
 * Asserts that the parser currently reports the given line and column.
 *
 * @param row expected value of {@code parser.getLineNumber()}
 * @param col expected value of {@code parser.getColumnNumber()}
 * @param parser the parser whose position is checked
 */
private static void assertPosition( int row, int col, MXParser parser )
{
    // The line is checked first, so a line mismatch is reported before the column is even read.
    final int actualLine = parser.getLineNumber();
    assertEquals( "Current line", row, actualLine );

    final int actualColumn = parser.getColumnNumber();
    assertEquals( "Current column", col, actualColumn );
}

/**
* <p>
* Test custom Entity not found.
Expand Down
Expand Up @@ -4,13 +4,12 @@
import static org.junit.Assert.fail;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import org.codehaus.plexus.util.ReaderFactory;
import org.junit.Before;
import org.junit.Test;

Expand Down Expand Up @@ -207,13 +206,15 @@ public void testhst_bh_006()
* Version:
*
* @throws java.io.IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether a UTF-8 file
* has a BOM or not.
*/
@Test
// @Test
public void testhst_lhs_007()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "007.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
Expand All @@ -234,13 +235,45 @@ public void testhst_lhs_007()
* Version:
*
* @throws java.io.IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether a UTF-16 file
* has a BOM or not.
*/
@Test
public void testhst_lhs_008()
// @Test
public void testhst_lhs_008_newReader()
throws IOException
{
try ( Reader reader =
ReaderFactory.newReader( new File( testResourcesDir, "008.xml" ), StandardCharsets.UTF_16.name() ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
}
}

/**
* Test ID: <pre>hst-lhs-008</pre>
* Test URI: <pre>008.xml</pre>
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible</pre>
* Sections: <pre>4.3.3</pre>
* Version:
*
* @throws java.io.IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as MXParser is unable to detect a UTF-16 BOM when chars are read as
* UTF-8, and XmlReader in lenient mode does not throw an exception.
*/
// @Test
public void testhst_lhs_008_XmlReader()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "008.xml" ) ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
Expand All @@ -261,14 +294,17 @@ public void testhst_lhs_008()
* Version:
*
* @throws java.io.IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as MXParser is unable to detect a UTF-16 BOM when chars are read as
* UTF-8.
*/
@Test
public void testhst_lhs_009()
// @Test
public void testhst_lhs_009_newReader()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
{
try ( Reader reader =
ReaderFactory.newReader( new File( testResourcesDir, "009.xml" ), StandardCharsets.UTF_16.name() ) )
{
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
Expand All @@ -280,4 +316,35 @@ public void testhst_lhs_009()
}
}

/**
 * Test ID: <pre>hst-lhs-009</pre>
 * Test URI: <pre>009.xml</pre>
 * Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible</pre>
 * Sections: <pre>4.3.3</pre>
 * Version:
 *
 * Reads 009.xml through ReaderFactory.newXmlReader and expects the encoding mismatch to surface as an
 * IOException rather than as an XmlPullParserException.
 *
 * @throws java.io.IOException if there is an I/O error
 */
@Test
public void testhst_lhs_009_XmlReader()
    throws IOException
{
    final File xmlFile = new File( testResourcesDir, "009.xml" );

    try ( Reader xmlReader = ReaderFactory.newXmlReader( xmlFile ) )
    {
        parser.setInput( xmlReader );

        // Consume tokens until END_DOCUMENT; getting that far means nothing was rejected.
        int token;
        do
        {
            token = parser.nextToken();
        }
        while ( token != XmlPullParser.END_DOCUMENT );

        fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
    }
    catch ( IOException e )
    {
        // The XmlReader raises an IOException for this mismatch even in lenient mode.
        final String expectedFragment =
            "Invalid encoding, BOM [UTF-16BE] XML guess [UTF-8] XML prolog [UTF-8] encoding mismatch";
        assertTrue( e.getMessage().contains( expectedFragment ) );
    }
    catch ( XmlPullParserException e )
    {
        fail( "Encoding problem should be detected by the XmlReader" );
    }
}

}

0 comments on commit 831f645

Please sign in to comment.