在for循环中使用正则表达式抓取天气数据

问题描述 投票:-1回答:1

我想抓取从现在起12小时后的大气压数据,但不知道如何修改正则表达式,才能捕获位于“td”内两个不同 class 的元素中的小时和分钟。我尝试在 find_all 中加上 class="ng-star-inserted" 属性来查找小时,但失败了。变量 v 只是用 bs4 解析得到的 html 片段;一天中的每个小时大约有两行这样的 v。

from urllib.request import urlopen as uReq
import numpy as np
import cv2
from bs4 import BeautifulSoup as soup
import re
import datetime
import os
import csv
import cefpython3 as cef
import sys
import selenium
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

v = (
<tr _ngcontent-app-root-c5="" class="mat-header-row ng-star-inserted" mat-header-row="" role="row">
 <!-- -->
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted" mat-header-cell="" role="columnheader">
  Time
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-conditions mat-column-conditions ng-star-inserted" mat-header-cell="" role="columnheader">
  Conditions
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-temperature mat-column-temperature ng-star-inserted" mat-header-cell="" role="columnheader">
  Temp.
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-feelsLike mat-column-feelsLike ng-star-inserted" mat-header-cell="" role="columnheader">
  Feels Like
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-precipitation mat-column-precipitation ng-star-inserted" mat-header-cell="" role="columnheader">
  Precip
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-liquidPrecipitation mat-column-liquidPrecipitation ng-star-inserted" mat-header-cell="" role="columnheader">
  Amount
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-cloudCover mat-column-cloudCover ng-star-inserted" mat-header-cell="" role="columnheader">
  Cloud Cover
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-dewPoint mat-column-dewPoint ng-star-inserted" mat-header-cell="" role="columnheader">
  Dew Point
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-humidity mat-column-humidity ng-star-inserted" mat-header-cell="" role="columnheader">
  Humidity
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-wind mat-column-wind ng-star-inserted" mat-header-cell="" role="columnheader">
  Wind
 </th>
 <th _ngcontent-app-root-c5="" class="mat-header-cell cdk-column-pressure mat-column-pressure ng-star-inserted" mat-header-cell="" role="columnheader">
  Pressure
 </th>
</tr>
<tr _ngcontent-app-root-c5="" class="mat-row ng-star-inserted" mat-row="" role="row">
 <!-- -->
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <span _ngcontent-app-root-c5="" class="ng-star-inserted">
   12
   <span _ngcontent-app-root-c5="" class="show-for-medium">
    :00
   </span>
   am
  </span>
  <!-- -->
  <!-- -->
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-conditions mat-column-conditions ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <!-- -->
  <span _ngcontent-app-root-c5="" class="ng-star-inserted">
   <img _ngcontent-app-root-c5="" alt="Partly Cloudy" class="no-scale" src="//www.wunderground.com/static/i/c/v4/29.svg"/>
   <span _ngcontent-app-root-c5="" class="show-for-medium conditions">
    Partly Cloudy
   </span>
   <span _ngcontent-app-root-c5="" class="show-for-small-only conditions">
   </span>
  </span>
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-temperature mat-column-temperature ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test-true wu-unit wu-unit-temperature is-degree-visible ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     49
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      F
     </span>
     <!-- -->
    </span>
    <!-- -->
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-feelsLike mat-column-feelsLike ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test-true wu-unit wu-unit-temperature is-degree-visible ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     49
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      F
     </span>
     <!-- -->
    </span>
    <!-- -->
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-precipitation mat-column-precipitation ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <a _ngcontent-app-root-c5="" class="ng-star-inserted" href="/precipitation/us/or/portland">
   <!-- -->
   <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
    <!-- -->
    <span _ngcontent-app-root-c15="" class="test- wu-unit wu-unit-chance ng-star-inserted">
     <!-- -->
     <!-- -->
     <!-- -->
     <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
      0
     </span>
     <span _ngcontent-app-root-c15="" class="wu-label">
      <!-- -->
      <span _ngcontent-app-root-c15="" class="ng-star-inserted">
       %
      </span>
      <!-- -->
     </span>
     <!-- -->
    </span>
    <!-- -->
   </lib-display-unit>
  </a>
  <!-- -->
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-liquidPrecipitation mat-column-liquidPrecipitation ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <a _ngcontent-app-root-c5="" class="ng-star-inserted" href="/precipitation/us/or/portland">
   <!-- -->
   <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
    <!-- -->
    <span _ngcontent-app-root-c15="" class="test- wu-unit wu-unit-rain ng-star-inserted">
     <!-- -->
     <!-- -->
     <!-- -->
     <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
      0
     </span>
     <span _ngcontent-app-root-c15="" class="wu-label">
      <!-- -->
      <span _ngcontent-app-root-c15="" class="ng-star-inserted">
       in
      </span>
      <!-- -->
     </span>
     <!-- -->
    </span>
    <!-- -->
   </lib-display-unit>
  </a>
  <!-- -->
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-cloudCover mat-column-cloudCover ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test- wu-unit wu-unit-chance ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     31
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      %
     </span>
     <!-- -->
    </span>
    <!-- -->
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-dewPoint mat-column-dewPoint ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test-true wu-unit wu-unit-temperature is-degree-visible ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     44
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      F
     </span>
     <!-- -->
    </span>
    <!-- -->
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-humidity mat-column-humidity ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test- wu-unit wu-unit-humidity ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     81
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      %
     </span>
     <!-- -->
    </span>
    <!-- -->
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-wind mat-column-wind ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test- wu-unit wu-unit-speed ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     2
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      mph
     </span>
     <!-- -->
    </span>
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-suffix ng-star-inserted">
     NW
    </span>
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
 <td _ngcontent-app-root-c5="" class="mat-cell cdk-column-pressure mat-column-pressure ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <!-- -->
  <!-- -->
  <lib-display-unit _ngcontent-app-root-c5="" _nghost-app-root-c15="" class="ng-star-inserted">
   <!-- -->
   <span _ngcontent-app-root-c15="" class="test- wu-unit wu-unit-pressure ng-star-inserted">
    <!-- -->
    <!-- -->
    <!-- -->
    <span _ngcontent-app-root-c15="" class="wu-value wu-value-to">
     30.05
    </span>
    <span _ngcontent-app-root-c15="" class="wu-label">
     <!-- -->
     <span _ngcontent-app-root-c15="" class="ng-star-inserted">
      in
     </span>
     <!-- -->
    </span>
    <!-- -->
   </span>
   <!-- -->
  </lib-display-unit>
  <!-- -->
 </td>
</tr>
)

# Pick the row of `v` whose time label is closest to a target time, then
# collect the text of every <td> cell in that row.

# Target time as a 24-hour "H:M" string (hard-coded for this example).
time_12 = '8:27'

# Parsing without date fields yields a datetime on strptime's default date
# (1900-01-01), so the comparison below is by time of day only.
userInputDt = datetime.datetime.strptime(time_12, '%H:%M')

# NOTE(review): assumes `save_row = None` and `time_delta = None` were set
# before this loop; neither assignment appears in this snippet.
for row in v:
    # Each match is a (time, meridiem) tuple such as ('12:00', 'AM').
    # The class [AP]M is case-sensitive, so lowercase "am"/"pm" never matches.
    m = re.findall('(\d+:\d+)\s+([AP]M)', row.get_text())
    if len(m) > 0:
        # Re-join the first match into "H:MM AM" and parse it as 12-hour time.
        dtString = ' '.join(map(str,m[0]))
        dt = datetime.datetime.strptime(dtString, '%I:%M %p')
        # Absolute distance between this row's time and the target time.
        timedelta = abs(dt - userInputDt)
        # Keep the row with the smallest distance seen so far.
        if time_delta == None or timedelta < time_delta:
            save_row = row
            time_delta = timedelta
# Cell texts of the selected row.
z = []

# If no row matched above, save_row is never reassigned by the loop, so this
# call runs on whatever value it held beforehand.
for td in save_row.find_all('td'):
        z.append(td.get_text())

输出

Traceback (most recent call last):
  File "C:\Python\Scripts\test2.py", line 57, in <module>
    for td in save_row.find_all('td'):
AttributeError: 'NoneType' object has no attribute 'find_all'

脚本的问题出在下面这一行(正则表达式没有匹配到任何行,因此 save_row 一直是 None):

m = re.findall('(\d+:\d+)\s+([AP]M)', row.get_text())

我相信我需要将html的小时和分钟合并为一个字符串,然后才能使用正则表达式,但不确定如何继续。

timeHour ng-star-inserted" mat-cell="" role="gridcell">
  <!-- -->
  <span _ngcontent-app-root-c5="" class="ng-star-inserted">
   12
   <span _ngcontent-app-root-c5="" class="show-for-medium">
    :00
   </span>

实际源代码

from urllib.request import urlopen as uReq
import numpy as np
import cv2
from bs4 import BeautifulSoup as soup
import re
import datetime
import os
import csv
import cefpython3 as cef
import sys
import selenium
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the forecast URL for the date 12 hours from now, load the page in
# headless Chrome, and locate the hourly forecast table in the rendered HTML.

# Naive local time now, and 12 hours ahead.
now = datetime.datetime.now()
now_plus_12 = now + datetime.timedelta(hours = 12)
# struct_time fields 0..4 are year, month, day, hour, minute.
future_12 = (now_plus_12.timetuple())
# str() on the integer fields gives no zero padding (e.g. '3', not '03').
fTup12 = str(future_12[0]),str(future_12[1]),str(future_12[2]),str(future_12[3]),str(future_12[4])
fList12 = list(fTup12)
year = fList12[0]
month = fList12[1]
day = fList12[2]
# Target time as a 24-hour "H:M" string, e.g. '8:27'.
timeTup = str(fList12[3]),str(fList12[4])
timeList = list(timeTup)
time_12 = ':'.join(timeList)

# Run Chrome without a visible window.
opts = webdriver.ChromeOptions()
opts.add_argument('headless')

# NOTE(review): newer Selenium releases expect `options=` rather than
# `chrome_options=` -- confirm against the installed Selenium version.
driver = webdriver.Chrome(chrome_options=opts)
driver.maximize_window()

weather_url = 'https://www.wunderground.com/hourly/us/or/portland/date/' + year + '-' + month + '-' + day
driver.get(weather_url)
# Fixed 20-second pause before reading the page source; presumably gives the
# page's scripts time to render the table -- TODO confirm.
time.sleep(20)
weather_html = driver.page_source
weather_soup = soup(weather_html, "html.parser")
# find() returns None when no <table> with this id is present.
table = weather_soup.find('table', id="hourly-forecast-table")

# Best-matching row so far and its distance from the target time; None means
# nothing has been selected yet (used by the commented-out search below).
save_row = None
time_delta = None

#userInputDt = datetime.datetime.strptime(time_12, '%H:%M')

#for row in table.find_all('tr'):
 #   m = re.findall('(\d+:\d+)\s+([AP]M)', row.get_text())
  #  if len(m) > 0:
   #     dtString = ' '.join(map(str,m[0]))
    #    dt = datetime.datetime.strptime(dtString, '%I:%M %p')
     #   timedelta = abs(dt - userInputDt)
      #  if time_delta == None or timedelta < time_delta:
       #     save_row = row
        #    time_delta = timedelta
#v = []

#for td in save_row.find_all('td'):
 #       v.append(td.get_text())
python regex web-scraping
1个回答
0
投票

发现HTML源码中的 AM / PM 已从大写变成了小写的 am / pm。

更改

m = re.findall('(\d+:\d+)\s+([AP]M)', row.get_text())

改为

m = re.findall('(\d+:\d+)\s+([ap]m)', row.get_text())
© www.soinside.com 2019 - 2024. All rights reserved.