(Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request'))) #6585

Open
12tilak34 opened this issue Nov 26, 2023 · 0 comments

I'm developing a program that scrapes news data using the requests library, but while running it I get the error below. My proxy list is working correctly.

error: Proxy: 103.21.244.100:80 - Error: HTTPSConnectionPool(host='www.nytimes.com', port=443): Max retries exceeded with url: /section/business/media (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request')))

proxylist.txt
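
The failure can be reproduced outside the scraper with a single request through one proxy. Here is a minimal sketch (proxy address taken from the error above); the "Tunnel connection failed: 400 Bad Request" text means the proxy itself rejected the CONNECT request that requests/urllib3 sends to tunnel HTTPS traffic:

import requests

# Check a single proxy from proxylist.txt in isolation. For an https:// URL,
# requests asks the proxy to open a CONNECT tunnel, so the 400 response comes
# from the proxy itself, not from nytimes.com.
proxy = "http://103.21.244.100:80"  # requests expects a scheme on proxy URLs

try:
    r = requests.get(
        "https://www.nytimes.com/section/business/media",
        proxies={"http": proxy, "https": proxy},
        timeout=10,
    )
    print(r.status_code)
except requests.exceptions.ProxyError as e:
    print(f"Proxy failed: {e}")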

Reproduction Steps

import sys
import time
import psycopg2
import newspaper
from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton
from threading import Thread
import requests


class NewsScraperGUI(QMainWindow):
    def __init__(self):
        super().__init__()
        self.initUI()
        self.scraping = False  # Flag to control scraping process

    def initUI(self):
        self.setWindowTitle('News Scraper')
        self.setGeometry(100, 100, 400, 200)

        self.start_button = QPushButton('Start Scraping', self)
        self.start_button.clicked.connect(self.startScraping)
        self.start_button.setGeometry(50, 50, 150, 30)

        self.stop_button = QPushButton('Stop Scraping', self)
        self.stop_button.clicked.connect(self.stopScraping)
        self.stop_button.setGeometry(200, 50, 150, 30)
        self.stop_button.setEnabled(False)  # Initially disabled

    def create_table(self):
        conn = psycopg2.connect(
            dbname='postgres',
            user='postgres',
            password='12Tilak34##',
            host='localhost'
        )
        cur = conn.cursor()
        cur.execute('''
            CREATE TABLE IF NOT EXISTS news (
                id SERIAL PRIMARY KEY,
                source TEXT,
                title TEXT,
                text TEXT,
                url TEXT
            );
        ''')
        conn.commit()
        conn.close()

    def insert_news(self, source, title, text, url):
        conn = psycopg2.connect(
            dbname='postgres',
            user='postgres',
            password='12Tilak34##',
            host='localhost'
        )
        cur = conn.cursor()
        cur.execute('''
            INSERT INTO news (source, title, text, url) VALUES (%s, %s, %s, %s);
        ''', (source, title, text, url))
        conn.commit()
        conn.close()

    def read_proxies(self, filename):
        try:
            with open(filename, 'r') as file:
                proxies = file.readlines()
                # Removing newline characters and any extra whitespace
                proxies = [proxy.strip() for proxy in proxies if proxy.strip()]
                return proxies
        except Exception as e:
            print(f"Error reading proxies from file: {e}")
            return []

    def scrape_news(self, source_url, num_articles, proxies_filename):
        proxies = self.read_proxies(proxies_filename)

        try:
            # newspaper.build() already downloads and parses the source's
            # category and feed pages, so separate download()/parse() calls
            # are not needed here.
            source = newspaper.build(source_url, memoize_articles=False)

            articles_scraped = 0
            for article in source.articles:
                # Stop early if the Stop button cleared the flag or the quota is met
                if not self.scraping or articles_scraped >= num_articles:
                    break

                proxy = proxies.pop(0) if proxies else None  # Take the first proxy from the list
                if proxy:
                    proxies.append(proxy)  # Re-append it so the proxies rotate round-robin

                # newspaper.build() only discovers article URLs; each article must
                # be downloaded and parsed before .title and .text are populated.
                article.download()
                article.parse()

                article_url = article.url
                article_title = article.title
                article_text = article.text

                try:
                    if proxy:
                        # Using requests library to send the request with the selected proxy
                        response = requests.get(article_url, proxies={"http": proxy, "https": proxy}, timeout=10)

                        if response.status_code == 200:
                            # If the request was successful, insert news and print proxy used
                            self.insert_news(source_url, article_title, article_text, article_url)
                            print(f"Proxy: {proxy} - Success: {article_url}")
                        else:
                            print(f"Proxy: {proxy} - Failed: {article_url}")

                    else:
                        print("No proxy available for request.")

                except Exception as e:
                    print(f"Proxy: {proxy} - Error: {e}")

                articles_scraped += 1

        except Exception as e:
            print(f"Error scraping from {source_url}: {e}")

    def startScraping(self):
        self.scraping = True
        self.start_button.setEnabled(False)
        self.stop_button.setEnabled(True)

        self.create_table()
        proxies_filename = "proxylist.txt"

        # Define websites and the number of articles to scrape from each
        websites = [
            {'url': 'http://ft.com', 'num_articles': 5},
            {'url': 'http://nytimes.com', 'num_articles': 5},
            {'url': 'http://www.bloomberg.com/economics', 'num_articles': 5},
            {'url': 'http://economictimes.indiatimes.com', 'num_articles': 5},
        ]

        self.threads = []
        for site in websites:
            thread = Thread(target=self.scrape_news, args=(site['url'], site['num_articles'], proxies_filename))
            self.threads.append(thread)
            thread.start()

    def stopScraping(self):
        self.scraping = False
        self.start_button.setEnabled(True)
        self.stop_button.setEnabled(False)

        for thread in self.threads:
            thread.join()


def main():
    app = QApplication(sys.argv)
    window = NewsScraperGUI()
    window.show()
    sys.exit(app.exec_())

if __name__ == '__main__':
    main()
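
Note: the proxy strings printed in the error (e.g. 103.21.244.100:80) carry no scheme. Here is a minimal sketch of normalizing the entries before passing them to requests, assuming proxylist.txt holds bare host:port pairs (normalize_proxy is a hypothetical helper, not part of the code above):

def normalize_proxy(entry: str) -> str:
    # Give each bare "host:port" entry a scheme; requests/urllib3 expects
    # proxy URLs such as "http://host:port". HTTPS requests are still
    # tunnelled through the proxy via CONNECT, so "http://" is the usual
    # scheme for a plain HTTP proxy.
    entry = entry.strip()
    if not entry.startswith(("http://", "https://", "socks5://")):
        entry = "http://" + entry
    return entry

with open("proxylist.txt") as f:
    proxies = [normalize_proxy(line) for line in f if line.strip()]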

System Information

$ python -m requests.help
C:\Python\Lib\site-packages\requests\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.15) or chardet (5.2.0)/charset_normalizer (2.0.12) doesn't match a supported version!
  warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
C:\Python\Lib\site-packages\requests\help.py:25: DeprecationWarning: 'urllib3.contrib.pyopenssl' module is deprecated and will be removed in a future release of urllib3 2.x. Read more in this issue: https://github.com/urllib3/urllib3/issues/2680
  from urllib3.contrib import pyopenssl
{
  "chardet": {
    "version": "5.2.0"
  },
  "charset_normalizer": {
    "version": "2.0.12"
  },
  "cryptography": {
    "version": "41.0.1"
  },
  "idna": {
    "version": "3.4"
  },
  "implementation": {
    "name": "CPython",
    "version": "3.11.3"
  },
  "platform": {
    "release": "10",
    "system": "Windows"
  },
  "pyOpenSSL": {
    "openssl_version": "30100010",
    "version": "23.2.0"
  },
  "requests": {
    "version": "2.26.0"
  },
  "system_ssl": {
    "version": "1010114f"
  },
  "urllib3": {
    "version": "1.26.15"
  },
  "using_charset_normalizer": false,
  "using_pyopenssl": true
}