Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix JRuby SAX parser entity handling (v1.12.x backport) #2329

Merged
merged 3 commits into from Sep 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
33 changes: 14 additions & 19 deletions ext/java/nokogiri/Html4SaxPushParser.java
@@ -1,31 +1,26 @@
package nokogiri;

import static nokogiri.XmlSaxPushParser.terminateExecution;
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static org.jruby.runtime.Helpers.invoke;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;

import nokogiri.internals.*;

import nokogiri.internals.ClosedStreamException;
import nokogiri.internals.NokogiriBlockingQueueInputStream;
import nokogiri.internals.NokogiriHelpers;
import nokogiri.internals.ParserContext;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.*;

import static nokogiri.XmlSaxPushParser.terminateExecution;
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static org.jruby.runtime.Helpers.invoke;

/**
* Class for Nokogiri::HTML4::SAX::PushParser
*
Expand Down Expand Up @@ -134,7 +129,7 @@ public class Html4SaxPushParser extends RubyObject

if (!options.recover && parserTask.getErrorCount() > errorCount0) {
terminateTask(context.runtime);
throw parserTask.getLastError();
throw parserTask.getLastError().toThrowable();
}

return this;
Expand Down
124 changes: 31 additions & 93 deletions ext/java/nokogiri/XmlSaxParserContext.java
@@ -1,33 +1,23 @@
package nokogiri;

import static org.jruby.runtime.Helpers.invoke;

import java.io.IOException;
import java.io.InputStream;

import nokogiri.internals.*;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyModule;
import org.jruby.RubyObjectAdapter;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.javasupport.JavaEmbedUtils;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;

import nokogiri.internals.NokogiriHandler;
import nokogiri.internals.NokogiriHelpers;
import nokogiri.internals.ParserContext;
import nokogiri.internals.XmlSaxParser;
import java.io.IOException;
import java.io.InputStream;

import static org.jruby.runtime.Helpers.invoke;

/**
* Base class for the SAX parsers.
Expand All @@ -51,6 +41,7 @@ public class XmlSaxParserContext extends ParserContext
protected AbstractSAXParser parser;

protected NokogiriHandler handler;
protected NokogiriErrorHandler errorHandler;
private boolean replaceEntities = true;
private boolean recovery = false;

Expand Down Expand Up @@ -168,31 +159,12 @@ public class XmlSaxParserContext extends ParserContext
return (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz);
}

/**
 * Forwards a property assignment to the wrapped SAX parser.
 *
 * @param key name of the parser property to set
 * @param val value to assign to that property
 * @throws SAXNotRecognizedException declared by the underlying
 *         <code>parser.setProperty</code> call
 * @throws SAXNotSupportedException declared by the underlying
 *         <code>parser.setProperty</code> call
 */
protected void
setProperty(String key, Object val)
throws SAXNotRecognizedException, SAXNotSupportedException
{
  // Pure delegation: no validation or translation happens here.
  this.parser.setProperty(key, val);
}

/**
 * Installs <code>handler</code> as the content handler of the wrapped SAX
 * parser.
 *
 * @param handler receiver for the parser's content events
 */
protected void
setContentHandler(ContentHandler handler)
{
  // The parameter shadows the handler field; only the parser is touched.
  this.parser.setContentHandler(handler);
}

/**
 * Installs <code>handler</code> as the error handler of the wrapped SAX
 * parser.
 *
 * @param handler receiver for the parser's error callbacks
 */
protected void
setErrorHandler(ErrorHandler handler)
{
  // The parameter shadows the handler field; only the parser is touched.
  this.parser.setErrorHandler(handler);
}

/**
 * Returns the current value of the <code>handler</code> field.
 *
 * NOTE(review): the field looks like it is only assigned inside parse_with,
 * so this presumably returns null before a parse has started -- confirm.
 *
 * @return the handler field, unchanged
 */
public final NokogiriHandler
getNokogiriHandler()
{
  return this.handler;
}

/**
 * Returns the current value of the <code>errorHandler</code> field.
 *
 * NOTE(review): the field looks like it is only assigned inside parse_with,
 * so this presumably returns null before a parse has started -- confirm.
 *
 * @return the errorHandler field, unchanged
 */
public final NokogiriErrorHandler
getNokogiriErrorHandler()
{
  return this.errorHandler;
}

/**
* Perform any initialization prior to parsing with the handler
* <code>handlerRuby</code>. Convenience hook for subclasses.
Expand Down Expand Up @@ -223,6 +195,17 @@ public class XmlSaxParserContext extends ParserContext
parser.parse(getInputSource());
}

/**
 * Builds parser options from the Ruby-side default for XML parsing.
 *
 * Looks up the <code>DEFAULT_XML</code> constant on
 * <code>Nokogiri::XML::ParseOptions</code>, converts it with <code>to_i</code>,
 * and wraps the resulting long in a new <code>ParserContext.Options</code>.
 *
 * @param context thread context used for the class lookup and the Ruby call
 * @return options built from the value of DEFAULT_XML
 */
protected static Options
defaultParseOptions(ThreadContext context)
{
  RubyClass parseOptionsClass =
    (RubyClass) context.getRuntime().getClassFromPath("Nokogiri::XML::ParseOptions");
  IRubyObject defaultXml = parseOptionsClass.getConstant("DEFAULT_XML");
  IRubyObject defaultXmlAsInteger = Helpers.invoke(context, defaultXml, "to_i");
  long optionBits = RubyFixnum.fix2long(defaultXmlAsInteger);

  return new ParserContext.Options(optionBits);
}

@JRubyMethod
public IRubyObject
parse_with(ThreadContext context, IRubyObject handlerRuby)
Expand All @@ -233,14 +216,19 @@ public class XmlSaxParserContext extends ParserContext
throw runtime.newArgumentError("argument must respond_to document");
}

NokogiriHandler handler = this.handler = new NokogiriHandler(runtime, handlerRuby);
preParse(runtime, handlerRuby, handler);
/* TODO: how should we pass in parse options? */
ParserContext.Options options = defaultParseOptions(context);

errorHandler = new NokogiriStrictErrorHandler(runtime, options.noError, options.noWarning);
handler = new NokogiriHandler(runtime, handlerRuby, errorHandler);

setContentHandler(handler);
setErrorHandler(handler);
preParse(runtime, handlerRuby, handler);
parser.setContentHandler(handler);
parser.setErrorHandler(handler);
parser.setEntityResolver(new NokogiriEntityResolver(runtime, errorHandler, options));

try {
setProperty("http://xml.org/sax/properties/lexical-handler", handler);
parser.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
} catch (Exception ex) {
throw runtime.newRuntimeError("Problem while creating XML SAX Parser: " + ex.toString());
}
Expand Down Expand Up @@ -270,8 +258,6 @@ public class XmlSaxParserContext extends ParserContext

postParse(runtime, handlerRuby, handler);

//maybeTrimLeadingAndTrailingWhitespace(context, handlerRuby);

return runtime.getNil();
}

Expand Down Expand Up @@ -319,53 +305,6 @@ public class XmlSaxParserContext extends ParserContext
return context.runtime.newBoolean(recovery);
}

/**
 * Removes blank nodes from the start and the end of a parsed fragment, then
 * normalizes the fragment.
 *
 * Does nothing unless the <code>@document</code> instance variable of
 * <code>parser</code> is a <code>Nokogiri::XML::FragmentHandler</code> with a
 * non-nil <code>@stack</code> whose first entry is non-nil. That first entry
 * is treated as the fragment to trim.
 *
 * This is a bit hackish: it reaches into FragmentHandler's instance
 * variables and therefore depends heavily on its internals.
 *
 * @param context thread context used to look up the FragmentHandler class
 * @param parser Ruby object whose <code>@document</code> is inspected
 */
protected void
maybeTrimLeadingAndTrailingWhitespace(ThreadContext context, IRubyObject parser)
{
  RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter();
  RubyModule fragmentHandlerClass = context.getRuntime().getClassFromPath("Nokogiri::XML::FragmentHandler");

  IRubyObject document = adapter.getInstanceVariable(parser, "@document");
  boolean isFragmentHandler =
    document != null && !document.isNil() && adapter.isKindOf(document, fragmentHandlerClass);
  if (!isFragmentHandler) {
    return;
  }

  IRubyObject stack = adapter.getInstanceVariable(document, "@stack");
  if (stack == null || stack.isNil()) {
    return;
  }

  // The bottom of the stack is the DocumentFragment whose children we check.
  IRubyObject fragment = adapter.callMethod(stack, "first");
  if (fragment == null || fragment.isNil()) {
    return;
  }

  // Re-fetch the child list after every unlink so each pass sees the
  // fragment's current first child.
  IRubyObject leading = adapter.callMethod(adapter.callMethod(fragment, "children"), "first");
  while (NokogiriHelpers.isBlank(leading)) {
    adapter.callMethod(leading, "unlink");
    leading = adapter.callMethod(adapter.callMethod(fragment, "children"), "first");
  }

  // Same procedure from the other end.
  IRubyObject trailing = adapter.callMethod(adapter.callMethod(fragment, "children"), "last");
  while (NokogiriHelpers.isBlank(trailing)) {
    adapter.callMethod(trailing, "unlink");
    trailing = adapter.callMethod(adapter.callMethod(fragment, "children"), "last");
  }

  // While we have the fragment in hand, normalize it.
  ((XmlNode) fragment).normalize();
}

@JRubyMethod(name = "column")
public IRubyObject
column(ThreadContext context)
Expand All @@ -383,5 +322,4 @@ public class XmlSaxParserContext extends ParserContext
if (number == null) { return context.getRuntime().getNil(); }
return RubyFixnum.newFixnum(context.getRuntime(), number.longValue());
}

}
42 changes: 17 additions & 25 deletions ext/java/nokogiri/XmlSaxPushParser.java
@@ -1,32 +1,24 @@
package nokogiri;

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static org.jruby.runtime.Helpers.invoke;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadFactory;

import nokogiri.internals.*;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyException;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;

import nokogiri.internals.ClosedStreamException;
import nokogiri.internals.NokogiriBlockingQueueInputStream;
import nokogiri.internals.NokogiriHandler;
import nokogiri.internals.NokogiriHelpers;
import nokogiri.internals.ParserContext;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.concurrent.*;

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static org.jruby.runtime.Helpers.invoke;

/**
* Class for Nokogiri::XML::SAX::PushParser
Expand Down Expand Up @@ -159,7 +151,8 @@ public class XmlSaxPushParser extends RubyObject

if (!options.recover && parserTask.getErrorCount() > errorCount0) {
terminateTask(context.runtime);
throw ex = parserTask.getLastError();
ex = parserTask.getLastError().toThrowable();
throw ex;
}

return this;
Expand Down Expand Up @@ -278,16 +271,15 @@ static class ParserTask extends ParserContext.ParserTask<XmlSaxParserContext>
getErrorCount()
{
// check for null because thread may not have started yet
if (parser.getNokogiriHandler() == null) { return 0; }
return parser.getNokogiriHandler().getErrorCount();
if (parser.getNokogiriErrorHandler() == null) { return 0; }
return parser.getNokogiriErrorHandler().getErrors().size();
}

synchronized final RaiseException
synchronized final RubyException
getLastError()
{
return parser.getNokogiriHandler().getLastError();
List<RubyException> errors = parser.getNokogiriErrorHandler().getErrors();
return errors.get(errors.size() - 1);
}

}

}
2 changes: 1 addition & 1 deletion ext/java/nokogiri/internals/NokogiriEntityResolver.java
Expand Up @@ -85,7 +85,7 @@ public class NokogiriEntityResolver implements EntityResolver2
private void
addError(String errorMessage)
{
if (handler != null) { handler.errors.add(new Exception(errorMessage)); }
if (handler != null) { handler.addError(new Exception(errorMessage)); }
}

/**
Expand Down
37 changes: 29 additions & 8 deletions ext/java/nokogiri/internals/NokogiriErrorHandler.java
@@ -1,11 +1,15 @@
package nokogiri.internals;

import java.util.ArrayList;
import java.util.List;

import nokogiri.XmlSyntaxError;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.jruby.Ruby;
import org.jruby.RubyException;
import org.jruby.exceptions.RaiseException;
import org.xml.sax.ErrorHandler;

import java.util.ArrayList;
import java.util.List;

/**
* Super class of error handlers.
*
Expand All @@ -17,23 +21,40 @@
*/
public abstract class NokogiriErrorHandler implements ErrorHandler, XMLErrorHandler
{
protected final List<Exception> errors;
private final Ruby runtime;
protected final List<RubyException> errors;
protected boolean noerror;
protected boolean nowarning;

public
NokogiriErrorHandler(boolean noerror, boolean nowarning)
NokogiriErrorHandler(Ruby runtime, boolean noerror, boolean nowarning)
{
this.errors = new ArrayList<Exception>(4);
this.runtime = runtime;
this.errors = new ArrayList<RubyException>(4);
this.noerror = noerror;
this.nowarning = nowarning;
}

List<Exception>
public List<RubyException>
getErrors() { return errors; }

public void
addError(Exception ex) { errors.add(ex); }
addError(Exception ex)
{
addError(XmlSyntaxError.createXMLSyntaxError(runtime, ex));
}

public void
addError(RubyException ex)
{
errors.add(ex);
}

public void
addError(RaiseException ex)
{
addError(ex.getException());
}

protected boolean
usesNekoHtml(String domain)
Expand Down