Python web scraper freezes at the 520th URL it is fed. What is wrong with it?

Problem description (votes: 1, answers: 2)
  1. A sample urls.csv that is fed to the scraper:
Main,Income statement,Balance sheet,Cash flows
https://www.investing.com/equities/vical-inc,https://www.investing.com/equities/vical-inc-income-statement,https://www.investing.com/equities/vical-inc-balance-sheet,https://www.investing.com/equities/vical-inc-cash-flow
  2. ncav.py is the actual scraper:
from os import system, name

from time import sleep

from csv import reader, writer

from ncavfunctions import income, flashbalance


def clear():
    """Clears GUI.

    """
    if name == 'nt':
        _ = system('cls')
    else:
        _ = system('clear')

# Try to read urls.csv. country is a list of lists of strings.
# country is initialised before the try so that a failed read leaves
# it empty instead of undefined.
country = []
try:
    with open('urls.csv', 'r', newline='') as csvfile:
        csv_reader = reader(csvfile)
        next(csv_reader)    # Skip the header row
        for line in csv_reader:
            country.append(line[:4])
    print("0. urls.csv loaded")
except OSError:
    print("Error with urls.csv file!")
    sleep(5)

# Construct country_ncav, a list of tuples of strings.
country_ncav = []
for i, line in enumerate(country, start=1):
    clear()
    loading_perc = i * 100 / len(country)
    print("Processed {0:.2f}".format(loading_perc), "% of urls")
    print("Processing...")
    try:
        lst = income(line[1]) + flashbalance(line[2])
        country_ncav.append(lst)
    except Exception:   # a bare except here would also swallow Ctrl-C
        country_ncav.append(["Unknown Error"])

# Save a csv log of country NCAV items.
header = ['Name', 'Shares', 'Last price',
          'Total current assets', 'Total Liabilities']
with open('flashncav.csv', 'w', newline='') as csvfile:
    csv_writer = writer(csvfile)
    csv_writer.writerow(header)
    csv_writer.writerows(country_ncav)

print("Closing in 5 seconds")
sleep(4)
print("Enjoy!")
sleep(1)
  3. The ncavfunctions module is as follows:
"""investing.com rejects get requests not identifying a User-Agent
1. Copy url to clipboard
2. Open Google Chrome, right click open space and click inspect
3. In Dev window click Network Tab
4. Paste url in Address Bar and press Enter, wait fully load
5. At Name window click info, on the right click Headers
6. Scroll Down to User-Agent and copy
7. Paste it between "" after "User-Agent": in var headers
8. Continue lines as needed

Parsing for investing.com
html.parser : prettify() encoding issues
lxml        : prettify() encoding issues
lxml-xml    : prettify() working but how grab from xml?
xml         : prettify() working but how grab from xml?
html5lib    : prettify() encoding isssues
Using html5, prettify() doesnt work due to encoding issues, but
i can grab the elements i want from soup element
"""
from requests import get

from bs4 import BeautifulSoup as soup


headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/50.0.2661.102 Safari/537.36")}

def indusector(url_m):
    """Return industry and sector from all known types of main page.

    Keyword arguments:
    url_m -- The url of the company's main page.
    """
    try:
        inse = indusector_a(url_m)
    except Exception:
        inse = (
            "Error main page or data N/A",
            "")
    return inse

def indusector_a(url_m):
    """Return industry and sector of a company from a type-a main url.

    Keyword arguments:
    url_m -- The url of the company's main page.
    """
    resp = get(url_m, headers=headers)
    page = soup(resp.content, "html5lib")
    cont_a = page.find("div", class_="companyProfileHeader")
    cont_b = cont_a.find_all("a")
    industry = cont_b[0].string
    sector = cont_b[1].string
    return industry, sector

def income(url_i):
    """Return all NCAV items from all known types of income statement.

    Keyword arguments:
    url_i -- The url of the Income statement.
    """
    try:
        inc = income_a(url_i)
    except Exception:
        try:
            inc = income_b(url_i)
        except Exception:
            inc = (
                "Error income statement or data n/a",
                "",
                "")
    return inc

def income_a(url_i):
    """Returns all NCAV items from income statements type a.

    Keyword arguments:
    url_i -- The url of the Income statement.
    """
    resp = get(url_i, headers=headers)
    page = soup(resp.content, "html5lib")

    # Grab secondaries from js dialog box and Diluted weighted average
    # shares
    cont_a = page.find_all("tbody")
    cont_b = cont_a[2].find_all("tr")
    cont_c = cont_b[31].find_all("td")
    shares = cont_c[1].string

    # Grab last price
    cont_a = page.find(id="last_last")
    lprice = cont_a.string

    # Grab Name
    cont_a = page.find("h1", class_="float_lang_base_1 relativeAttr")
    namet = cont_a.string
    name = namet[:-1]    # drop the trailing character

    return name, shares, lprice

def income_b(url_i):
    """Returns all NCAV items from income statements type b.

    Keyword arguments:
    url_i -- The url of the Income statement.
    """
    resp = get(url_i, headers=headers)
    page = soup(resp.content, "html5lib")
    cont_a = page.find_all("tbody")
    cont_b = cont_a[1].find_all("tr")  # [1]vs[2] is the difference
    cont_c = cont_b[31].find_all("td") # between the 2 types.
    shares = cont_c[1].string
    cont_a = page.find(id="last_last")
    lprice = cont_a.string
    cont_a = page.find("h1", class_="float_lang_base_1 relativeAttr")
    namet = cont_a.string
    name = namet[:-1]    # drop the trailing character
    return name, shares, lprice

def balance(url_b):
    """Return all NCAV items from all known types of Balance sheet.

    Keyword arguments:
    url_b -- The url of the Balance sheet.
    """
    try:
        bal = balance_a(url_b)
    except Exception:
        bal = (
            "Error balance sheet or data n/a",
            "", "",
            "", "",
            "", "")
    return bal

def balance_a(url_b):
    """Returns all NCAV items from Balance sheet type a.

    Keyword arguments:
    url_b -- The url of the Balance sheet.
    """
    resp = get(url_b, headers=headers)
    page = soup(resp.content, "html5lib")

    # Grab bolds of js dialog box
    cont_a = page.find_all(id="parentTr")

    # Grab last total current assets
    cont_b = cont_a[0].find_all("td")
    last_tot_curr_ass = cont_b[1].string

    # Grab last total liabilities
    cont_b = cont_a[3].find_all("td")
    last_tot_liabs = cont_b[1].string

    # Grab secondaries of js dialog box
    cont_a = page.find_all("tr", class_="child")

    # Grab last cash
    cont_b = cont_a[1].find_all("td")
    last_cash = cont_b[1].string

    # Grab last cash & equivalents
    cont_b = cont_a[2].find_all("td")
    last_casnnequins = cont_b[1].string

    # Grab accounts receivables
    cont_b = cont_a[5].find_all("td")
    last_accreceivs = cont_b[1].string

    # Grab last inventory
    cont_b = cont_a[6].find_all("td")
    last_invs = cont_b[1].string

    # Grab last total debt
    cont_b = cont_a[27].find_all("td")
    last_tot_dts = cont_b[1].string

    return (
        last_tot_curr_ass, last_cash,
        last_casnnequins, last_accreceivs,
        last_invs, last_tot_liabs,
        last_tot_dts)

def cashflow(url_c):
    """Return opcash and capex from all known types of Statement of cash flows.

    Keyword arguments:
    url_c -- The url of the Statement of cash flows.
    """
    try:
        cas = cashflow_a(url_c)
    except Exception:
        cas = (
            "Error cash flow statement or data n/a", "",
            "", "",
            "", "",
            "", "")

    return cas

def cashflow_a(url_c):
    """Returns opcash and capex from Statement of cash flows type a.

    Keyword arguments:
    url_c -- The url of the Statement of cash flows.
    """
    resp = get(url_c, headers=headers)
    page = soup(resp.content, "html5lib")

    # Grab bolds of js dialog box and incremental operating income
    cont_a = page.find_all(id="parentTr")

    cont_b = cont_a[0].find_all("td")
    incr_opcash_4 = cont_b[1].string
    incr_opcash_3 = cont_b[2].string
    incr_opcash_2 = cont_b[3].string
    incr_opcash_1 = cont_b[4].string

    # Grab secondaries of js dialog box and incremental capital
    # expenditures
    cont_a = page.find_all("tr", class_="child")
    cont_b = cont_a[9].find_all("td")
    incr_capex_4 = cont_b[1].string
    incr_capex_3 = cont_b[2].string
    incr_capex_2 = cont_b[3].string
    incr_capex_1 = cont_b[4].string

    return (
        incr_opcash_4, incr_opcash_3,
        incr_opcash_2, incr_opcash_1,
        incr_capex_4, incr_capex_3,
        incr_capex_2, incr_capex_1)

def lastprice(url_i):
    """Return the last price from all known types of income statement.

    Keyword arguments:
    url_i -- The url of the Income statement.
    """
    try:
        lprice = lastprice_a(url_i)
    except Exception:
        try:
            lprice = lastprice_b(url_i)
        except Exception:
            lprice = ("iError", "iError")
    return lprice

def lastprice_a(url_i):
    """Returns last price from income statements type a.

    Keyword arguments:
    url_i -- The url of the Income statement.
    """
    resp = get(url_i, headers=headers)
    page = soup(resp.content, "html5lib")

    # Grab last price
    cont_a = page.find(id="last_last")
    lprice = cont_a.string

    return lprice

def flashbalance(url_b):
    """Return total current assets and total liabilities from all known
    types of Balance sheet.

    Keyword arguments:
    url_b -- The url of the Balance sheet.
    """
    try:
        flashbal = flashbalance_a(url_b)
    except Exception:
        flashbal = ("Error balance sheet or data n/a", "")
    return flashbal

def flashbalance_a(url_b):
    """Returns all NCAV items from Balance sheet type a.

    Keyword arguments:
    url_b -- The url of the Balance sheet.
    """
    resp = get(url_b, headers=headers)
    page = soup(resp.content, "html5lib")

    # Grab bolds of js dialog box
    cont_a = page.find_all(id="parentTr")

    # Grab last total current assets
    cont_b = cont_a[0].find_all("td")
    last_tot_curr_ass = cont_b[1].string

    # Grab last total liabilities
    cont_b = cont_a[3].find_all("td")
    last_tot_liabs = cont_b[1].string

    return (last_tot_curr_ass, last_tot_liabs)
  4. Sample output flashncav.csv:
Name,Shares,Last price,Total current assets,Total Liabilities
Vical Inc (BBI),2.94,1.4700,30.73,13.14
  5. The problem: if the csv file contains more than 130 rows, ncav.py freezes. The current workaround is to manually break urls.csv (sometimes 2,000 companies) into groups of 130 rows. Is there a better idea? Thanks in advance.
python web-scraping beautifulsoup python-requests freeze
2 Answers
0 votes

I believe you are overloading memory. As a first step, I would use a command like top on Linux to monitor how memory behaves while the script runs. If the problem is that you are running out of memory, one thing that could be done better is to not append the results to the country_ncav list, but instead write each result directly to the file, line by line.
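
A quick way to check this from inside the script is a small sketch like the one below, assuming the third-party psutil package (pip install psutil); watching top in another terminal works just as well.

from psutil import Process

_proc = Process()   # handle to the current (this script's) process


def rss_mb():
    """Return this process's resident memory in megabytes."""
    return _proc.memory_info().rss / 1024 ** 2

Printing rss_mb() once per loop iteration in ncav.py shows whether memory really grows as country_ncav gets longer.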


0 votes

After applying the conceptual solution above, a rewritten ncav.py completed successfully.
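
A minimal sketch of such a streaming rewrite (an illustration reusing income and flashbalance from ncavfunctions, not the poster's exact listing):

from csv import reader, writer

from ncavfunctions import income, flashbalance


HEADER = ['Name', 'Shares', 'Last price',
          'Total current assets', 'Total Liabilities']

with open('urls.csv', 'r', newline='') as src, \
        open('flashncav.csv', 'w', newline='') as dst:
    csv_reader = reader(src)
    next(csv_reader)                # skip the header row
    csv_writer = writer(dst)
    csv_writer.writerow(HEADER)
    for i, line in enumerate(csv_reader, start=1):
        try:
            row = income(line[1]) + flashbalance(line[2])
        except Exception:
            row = ["Unknown Error"]
        csv_writer.writerow(row)    # write immediately, accumulate nothing
        dst.flush()                 # partial results survive a crash
        print("Processed row", i)

Because no result list accumulates, memory use stays flat no matter how many thousands of rows urls.csv contains.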
