Fix for issue 6047 #6176

Closed
wants to merge 7 commits into from
93 changes: 78 additions & 15 deletions scrapy/crawler.py
@@ -138,6 +138,32 @@ def _apply_settings(self) -> None:
            "Overridden settings:\n%(settings)s", {"settings": pprint.pformat(d)}
        )

    # @inlineCallbacks
    # def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
    #     if self.crawling:
    #         raise RuntimeError("Crawling already taking place")
    #     if self._started:
    #         warnings.warn(
    #             "Running Crawler.crawl() more than once is deprecated.",
    #             ScrapyDeprecationWarning,
    #             stacklevel=2,
    #         )
    #     self.crawling = self._started = True

    #     try:
    #         self.spider = self._create_spider(*args, **kwargs)
    #         self._apply_settings()
    #         self._update_root_log_handler()
    #         self.engine = self._create_engine()
    #         start_requests = iter(self.spider.start_requests())
    #         yield self.engine.open_spider(self.spider, start_requests)
    #         yield maybeDeferred(self.engine.start)
    #     except Exception:
    #         self.crawling = False
    #         if self.engine is not None:
    #             yield self.engine.close()
    #         raise

    @inlineCallbacks
    def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
        if self.crawling:
@@ -150,20 +176,26 @@ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
            )
        self.crawling = self._started = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self._apply_settings()
            self._update_root_log_handler()
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise
        @inlineCallbacks
        def crawlHelper():
            # Create the spider and engine, open the spider, and start the engine.
            self.spider = self._create_spider(*args, **kwargs)
            self._apply_settings()
            self._update_root_log_handler()
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield maybeDeferred(self.engine.start)

        def errorHandler(failure):
            # Mirror the old ``except Exception`` clause: anything that is not an
            # Exception is re-raised by trap(). Otherwise clear the crawling flag,
            # close the engine if it was created, and propagate the failure.
            failure.trap(Exception)
            self.crawling = False
            if self.engine is not None:
                d = maybeDeferred(self.engine.close)
                d.addBoth(lambda _: failure)
                return d
            return failure

        d = crawlHelper()
        d.addErrback(errorHandler)
        yield d


    def _create_spider(self, *args: Any, **kwargs: Any) -> Spider:
        return self.spidercls.from_crawler(self, *args, **kwargs)

@@ -243,13 +275,44 @@ def crawl(

        :param kwargs: keyword arguments to initialize the spider
        """
        # if isinstance(crawler_or_spidercls, Spider):
        #     raise ValueError(
        #         "The crawler_or_spidercls argument cannot be a spider object, "
        #         "it must be a spider class (or a Crawler object)"
        #     )
        # crawler = self.create_crawler(crawler_or_spidercls)
        # return self._crawl(crawler, *args, **kwargs)

        # # one method
        # def createInstance():
        #     if isinstance(crawler_or_spidercls, Spider):
        #         raise ValueError(
        #             "The crawler_or_spidercls argument cannot be a spider object, "
        #             "it must be a spider class (or a Crawler object)"
        #         )
        # def errorHandler(failure):
        #     failure.trap(ValueError)

        # self.addCallBack(createInstance)
        # self.addErrback(errorHandler)

        # crawler = self.create_crawler(crawler_or_spidercls)

        if isinstance(crawler_or_spidercls, Spider):
            raise ValueError(
                "The crawler_or_spidercls argument cannot be a spider object, "
                "it must be a spider class (or a Crawler object)"
            )
        crawler = self.create_crawler(crawler_or_spidercls)
        return self._crawl(crawler, *args, **kwargs)
        def errorHandler(failure):
            # trap() re-raises anything that is not a ValueError; returning the
            # failure otherwise lets it propagate through the Deferred chain.
            failure.trap(ValueError)
            return failure

        def crawlHelper(_):
            crawler = self.create_crawler(crawler_or_spidercls)
            return self._crawl(crawler, *args, **kwargs)

        d = maybeDeferred(crawlHelper, None)
        d.addErrback(errorHandler)
        return d


    def _crawl(self, crawler: Crawler, *args: Any, **kwargs: Any) -> Deferred:
        self.crawlers.add(crawler)
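Note: the change above replaces the try/except cleanup in Crawler.crawl with a Twisted Deferred callback/errback chain. A minimal standalone sketch of that pattern, assuming nothing beyond stock Twisted, is shown below; do_work and cleanup are illustrative names, not part of the Scrapy API.

from twisted.internet import defer


@defer.inlineCallbacks
def do_work(fail=False):
    # Stand-in for the spider/engine setup performed in Crawler.crawl().
    if fail:
        raise RuntimeError("engine failed to start")
    yield defer.succeed(None)


def cleanup(failure):
    # Equivalent of the old ``except Exception`` block: run cleanup code,
    # then return the failure so the caller still sees the original error.
    print("cleaning up after:", failure.getErrorMessage())
    return failure


d = do_work(fail=True)
d.addErrback(cleanup)
d.addErrback(lambda f: print("caller saw:", f.getErrorMessage()))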