Skip to content

Commit

Permalink
Fixed regressions:
Browse files Browse the repository at this point in the history
* #163 - new case:  Don't assume UTF8 as default, to allow parsing from String.
* #194 - Incorrect getText() after parsing the DOCDECL section.
* Added tests exercising other regressions exposed while fixing these issues.
  • Loading branch information
belingueres committed Apr 14, 2022
1 parent 3896620 commit 9dc1610
Show file tree
Hide file tree
Showing 6 changed files with 635 additions and 44 deletions.
154 changes: 110 additions & 44 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Expand Up @@ -124,7 +124,7 @@ private String newStringIntern( char[] cbuf, int off, int len )
// private String elValue[];
private int elNamespaceCount[];

private String fileEncoding = "UTF8";
private String fileEncoding = null;

/**
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
Expand Down Expand Up @@ -587,8 +587,8 @@ else if ( FEATURE_XML_ROUNDTRIP.equals( name ) )
}
}

/**
* Unknown properties are <strong>always</strong> returned as false
/**
* Unknown properties are <strong>always</strong> returned as false
*/
@Override
public boolean getFeature( String name )
Expand Down Expand Up @@ -1596,11 +1596,11 @@ else if ( ch == '&' )
}
final int oldStart = posStart + bufAbsoluteStart;
final int oldEnd = posEnd + bufAbsoluteStart;
final char[] resolvedEntity = parseEntityRef();
parseEntityRef();
if ( tokenize )
return eventType = ENTITY_REF;
// check if replacement text can be resolved !!!
if ( resolvedEntity == null )
if ( resolvedEntityRefCharBuf == BUF_NOT_RESOLVED )
{
if ( entityRefName == null )
{
Expand Down Expand Up @@ -1628,7 +1628,7 @@ else if ( ch == '&' )
}
// assert usePC == true;
// write into PC replacement text - do merge for replacement text!!!!
for ( char aResolvedEntity : resolvedEntity )
for ( char aResolvedEntity : resolvedEntityRefCharBuf )
{
if ( pcEnd >= pc.length )
{
Expand Down Expand Up @@ -2675,9 +2675,28 @@ else if ( ch == '\t' || ch == '\n' || ch == '\r' )
return ch;
}

private char[] charRefOneCharBuf = new char[1];
// state representing that no entity ref have been resolved
private static final char[] BUF_NOT_RESOLVED = new char[0];

// predefined entity refs
private static final char[] BUF_LT = new char[] { '<' };
private static final char[] BUF_AMP = new char[] { '&' };
private static final char[] BUF_GT = new char[] { '>' };
private static final char[] BUF_APO = new char[] { '\'' };
private static final char[] BUF_QUOT = new char[] { '"' };

private char[] parseEntityRef()
private char[] resolvedEntityRefCharBuf = BUF_NOT_RESOLVED;

/**
* parse Entity Ref, either a character entity or one of the predefined name entities.
*
* @return for a numeric character reference, the number of chars of its replacement (the length of
* resolvedEntityRefCharBuf); for a named reference, the length of the entity name, whether or not it is one of
* the predefined names. resolvedEntityRefCharBuf holds the replacement chars when the reference was resolved.
* @throws XmlPullParserException if invalid XML is detected.
* @throws IOException if an I/O error is found.
*/
private int parseCharOrPredefinedEntityRef()
throws XmlPullParserException, IOException
{
// entity reference http://www.w3.org/TR/2000/REC-xml-20001006#NT-Reference
Expand All @@ -2686,6 +2705,8 @@ private char[] parseEntityRef()
// ASSUMPTION just after &
entityRefName = null;
posStart = pos;
int len = 0;
resolvedEntityRefCharBuf = BUF_NOT_RESOLVED;
char ch = more();
if ( ch == '#' )
{
Expand Down Expand Up @@ -2750,7 +2771,6 @@ else if ( ch >= 'A' && ch <= 'F' )
ch = more();
}
}
posEnd = pos - 1;

boolean isValidCodePoint = true;
try
Expand All @@ -2759,7 +2779,7 @@ else if ( ch >= 'A' && ch <= 'F' )
isValidCodePoint = isValidCodePoint( codePoint );
if ( isValidCodePoint )
{
charRefOneCharBuf = Character.toChars( codePoint );
resolvedEntityRefCharBuf = Character.toChars( codePoint );
}
}
catch ( IllegalArgumentException e )
Expand All @@ -2775,14 +2795,14 @@ else if ( ch >= 'A' && ch <= 'F' )

if ( tokenize )
{
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
text = newString( resolvedEntityRefCharBuf, 0, resolvedEntityRefCharBuf.length );
}
return charRefOneCharBuf;
len = resolvedEntityRefCharBuf.length;
}
else
{
// [68] EntityRef ::= '&' Name ';'
// scan anem until ;
// scan name until ;
if ( !isNameStartChar( ch ) )
{
throw new XmlPullParserException( "entity reference names can not start with character '"
Expand All @@ -2801,17 +2821,15 @@ else if ( ch >= 'A' && ch <= 'F' )
+ printable( ch ) + "'", this, null );
}
}
posEnd = pos - 1;
// determine what name maps to
final int len = posEnd - posStart;
len = ( pos - 1 ) - posStart;
if ( len == 2 && buf[posStart] == 'l' && buf[posStart + 1] == 't' )
{
if ( tokenize )
{
text = "<";
}
charRefOneCharBuf[0] = '<';
return charRefOneCharBuf;
resolvedEntityRefCharBuf = BUF_LT;
// if(paramPC || isParserTokenizing) {
// if(pcEnd >= pc.length) ensurePC();
// pc[pcEnd++] = '<';
Expand All @@ -2823,17 +2841,15 @@ else if ( len == 3 && buf[posStart] == 'a' && buf[posStart + 1] == 'm' && buf[po
{
text = "&";
}
charRefOneCharBuf[0] = '&';
return charRefOneCharBuf;
resolvedEntityRefCharBuf = BUF_AMP;
}
else if ( len == 2 && buf[posStart] == 'g' && buf[posStart + 1] == 't' )
{
if ( tokenize )
{
text = ">";
}
charRefOneCharBuf[0] = '>';
return charRefOneCharBuf;
resolvedEntityRefCharBuf = BUF_GT;
}
else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[posStart + 2] == 'o'
&& buf[posStart + 3] == 's' )
Expand All @@ -2842,8 +2858,7 @@ else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[po
{
text = "'";
}
charRefOneCharBuf[0] = '\'';
return charRefOneCharBuf;
resolvedEntityRefCharBuf = BUF_APO;
}
else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[posStart + 2] == 'o'
&& buf[posStart + 3] == 't' )
Expand All @@ -2852,21 +2867,60 @@ else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[po
{
text = "\"";
}
charRefOneCharBuf[0] = '"';
return charRefOneCharBuf;
}
else
{
final char[] result = lookuEntityReplacement( len );
if ( result != null )
{
return result;
}
resolvedEntityRefCharBuf = BUF_QUOT;
}
if ( tokenize )
text = null;
return null;
}

posEnd = pos;

return len;
}

/**
* Parse an entity reference inside the DOCDECL section.
*
* @throws XmlPullParserException if invalid XML is detected.
* @throws IOException if an I/O error is found.
*/
private void parseEntityRefInDocDecl()
throws XmlPullParserException, IOException
{
parseCharOrPredefinedEntityRef();
if (usePC) {
posStart--; // include in PC the starting '&' of the entity
joinPC();
}

if ( resolvedEntityRefCharBuf != BUF_NOT_RESOLVED )
return;
if ( tokenize )
text = null;
}

/**
* Parse an entity reference inside a tag or attribute.
*
* @throws XmlPullParserException if invalid XML is detected.
* @throws IOException if an I/O error is found.
*/
private void parseEntityRef()
throws XmlPullParserException, IOException
{
final int len = parseCharOrPredefinedEntityRef();

posEnd--; // don't involve the final ';' from the entity in the search

if ( resolvedEntityRefCharBuf != BUF_NOT_RESOLVED ) {
return;
}

resolvedEntityRefCharBuf = lookuEntityReplacement( len );
if ( resolvedEntityRefCharBuf != BUF_NOT_RESOLVED )
{
return;
}
if ( tokenize )
text = null;
}

/**
Expand All @@ -2883,8 +2937,6 @@ private static boolean isValidCodePoint( int codePoint )
}

private char[] lookuEntityReplacement( int entityNameLen )
throws XmlPullParserException, IOException

{
if ( !allStringsInterned )
{
Expand Down Expand Up @@ -2919,7 +2971,7 @@ private char[] lookuEntityReplacement( int entityNameLen )
}
}
}
return null;
return BUF_NOT_RESOLVED;
}

private void parseComment()
Expand Down Expand Up @@ -2977,7 +3029,7 @@ else if (isValidCodePoint( ch ))
}
else
{
throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString(((int) ch)) + " found in comment", this, null );
throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString((ch)) + " found in comment", this, null );
}
if ( normalizeIgnorableWS )
{
Expand Down Expand Up @@ -3484,7 +3536,8 @@ else if ( ch == '>' && bracketLevel == 0 )
break;
else if ( ch == '&' )
{
extractEntityRef();
extractEntityRefInDocDecl();
continue;
}
if ( normalizeIgnorableWS )
{
Expand Down Expand Up @@ -3536,6 +3589,19 @@ else if ( ch == '\n' )

}
posEnd = pos - 1;
text = null;
}

private void extractEntityRefInDocDecl()
throws XmlPullParserException, IOException
{
// extractEntityRef
posEnd = pos - 1;

int prevPosStart = posStart;
parseEntityRefInDocDecl();

posStart = prevPosStart;
}

private void extractEntityRef()
Expand All @@ -3559,9 +3625,9 @@ private void extractEntityRef()
}
// assert usePC == true;

final char[] resolvedEntity = parseEntityRef();
parseEntityRef();
// check if replacement text can be resolved !!!
if ( resolvedEntity == null )
if ( resolvedEntityRefCharBuf == BUF_NOT_RESOLVED )
{
if ( entityRefName == null )
{
Expand All @@ -3571,7 +3637,7 @@ private void extractEntityRef()
+ "'", this, null );
}
// write into PC replacement text - do merge for replacement text!!!!
for ( char aResolvedEntity : resolvedEntity )
for ( char aResolvedEntity : resolvedEntityRefCharBuf )
{
if ( pcEnd >= pc.length )
{
Expand Down

0 comments on commit 9dc1610

Please sign in to comment.