lrabbit_scrapy

This is a small spider framework that is easy to get running. You don't have to rewrite the same repetitive code every time; with this small framework you can quickly crawl data into a file or a database.

Requirements

python >=3.6.8

Installing

$ pip3 install lrabbit_scrapy

quick start

  • python3 -m lrabbit_scrapy new_scrapy blog
    • this creates a blog directory containing the generated spider file, blog_spider.py:
from lrabbit_scrapy.spider import LrabbitSpider
from lrabbit_scrapy.common_utils.network_helper import RequestSession
from lrabbit_scrapy.common_utils.print_log_helper import LogUtils
from lrabbit_scrapy.common_utils.all_in_one import FileStore
import os
from lrabbit_scrapy.common_utils.mysql_helper import MysqlClient
from parsel import Selector


class Spider(LrabbitSpider):
    """
        spider_name : lrabbit blog spider
    """
    # unique spider name
    spider_name = "lrabbit_blog"
    # maximum number of worker threads
    max_thread_num = 2
    # open a MySQL connection for every thread; if max_thread_num exceeds 10
    # and your worker code runs MySQL queries, you need to enable this
    thread_mysql_open = True
    # reset the whole task list; every program restart re-initializes the task list
    reset_task_config = False
    # loop init_task_list: when all tasks are finished and you want to run
    # them again, enable this
    loop_task_config = False
    # confirmation option: if enabled, you are asked to confirm when the
    # task list is (re)initialized
    remove_confirm_config = False
    # name of the environment variable that points to the config file;
    # on Linux, for example: export config_path="crawl.ini"
    config_env_name = "config_path"
    # Redis database number
    redis_db_config = 0
    # debug logging; enables traceback output
    debug_config = False

    def __init__(self):
        super().__init__()
        self.session = RequestSession()
        self.proxy_session = RequestSession(proxies=None)
        csv_path = os.path.join(os.path.abspath(os.getcwd()), f"{self.spider_name}.csv")
        self.field_names = ['id', 'title', 'datetime']
        self.blog_file = FileStore(file_path=csv_path, filed_name=self.field_names)

    def worker(self, *args):
        task = args[0]
        mysql_client: MysqlClient
        if len(args) == 2:
            # a per-thread MySQL client is passed in when thread_mysql_open is True
            mysql_client = args[1]
            # mysql_client.execute("...")
        res = self.session.send_request(method='GET', url=f'http://www.lrabbit.life/post_detail/?id={task}')
        selector = Selector(res.text)
        title = selector.css(".detail-title h1::text").get()
        datetime = selector.css(".detail-info span::text").get()
        if title:
            post_data = {"id": task, "title": title, 'datetime': datetime}
            self.blog_file.write(post_data)
            # after successfully fetching the content, update the crawl stats in Redis
            self.update_stat_redis()
        LogUtils.log_finish(task)

    def init_task_list(self):
        # you can also build the initial task list from MySQL:
        # res = self.mysql_client.query("select id from rookie limit 100")
        # return [task['id'] for task in res]
        return list(range(100))


if __name__ == '__main__':
    spider = Spider()
    spider.run()
  • set up crawl.ini and the config environment variable

    • create crawl.ini; in this example the file path is /root/crawl.ini (a sketch of loading such a file follows after this list)
    [server]
    mysql_user = root
    mysql_password = 123456
    mysql_database = test
    mysql_host = 192.168.1.1
    redis_user = lrabbit
    redis_host = 192.168.1.1
    redis_port = 6379
    redis_password = 123456
    
    [test]
    mysql_user = root
    mysql_password = 123456
    mysql_database = test
    mysql_host = 192.168.1.1
    redis_user = lrabbit
    redis_host = 192.168.1.1
    redis_port = 6379
    redis_password = 123456
    • set the config_path environment variable
      • Windows PowerShell: $env:config_path = "/root/crawl.ini"
      • Linux: export config_path="/root/crawl.ini"
  • run the spider: python3 blog_spider.py
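
The config file is located through the config_path environment variable. For reference, here is a minimal sketch of loading such a file with Python's standard configparser module; how lrabbit_scrapy itself picks the section ([server] vs. [test]) is an assumption here, not documented behavior.

import configparser
import os

# minimal sketch: load crawl.ini through the config_path environment variable;
# reading the [server] section below is an assumption, not documented behavior
config = configparser.ConfigParser()
config.read(os.environ["config_path"])  # e.g. /root/crawl.ini

server = config["server"]
print(server["mysql_host"], server.getint("redis_port"))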

other functions

  • python3 blog_spider.py stat
    • show task statistics (the sketch after this list illustrates the underlying pattern)
  • python3 -m lrabbit_scrapy sslpass
    • bypass Android SSL
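
For intuition, spider.run() dispatches the tasks produced by init_task_list to max_thread_num worker threads and records finished tasks in Redis, which is what the stat command reports on. The sketch below illustrates that general pattern only; the Redis key name ("demo:done") and the exact flow are assumptions, not lrabbit_scrapy's actual internals.

from concurrent.futures import ThreadPoolExecutor

import redis

# illustration of the dispatch pattern only; the key name ("demo:done") and
# the flow are assumptions, not lrabbit_scrapy's actual internals
r = redis.Redis(host="192.168.1.1", port=6379, password="123456", db=0)


def init_task_list():
    # same role as Spider.init_task_list: produce the full task list once
    return list(range(100))


def worker(task):
    # same role as Spider.worker: crawl one task, then mark it finished,
    # comparable to what update_stat_redis does
    r.sadd("demo:done", task)


def run():
    done = {int(t) for t in r.smembers("demo:done")}
    pending = [t for t in init_task_list() if t not in done]
    with ThreadPoolExecutor(max_workers=2) as pool:  # max_thread_num = 2
        list(pool.map(worker, pending))
    print(f"{r.scard('demo:done')} tasks finished")  # roughly what "stat" shows


if __name__ == "__main__":
    run()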
