這篇已經過期失效, 新版的在這裡
氣象局網站的「地震活動彙整」列表：檢視網頁內容後可知，真正的資料網頁連結為 https://scweb.cwb.gov.tw/Page.aspx/?ItemId=20&loc=tw&adv=1 。因為是 ASP.NET 網頁，所以需先取出頁面中的隱藏表單參數（__VIEWSTATE、__VIEWSTATEGENERATOR、__VIEWSTATEENCRYPTED、__EVENTVALIDATION），再連同年份與月份一起送出查詢；資料表格的 class 為 datalist4，使用 BeautifulSoup 即可取出表格內容。
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 15 11:53:25 2018

@author: ghosty
@Program: cwbweb_list.py
@Purpose: download quake data from CWB web
source: https://www.cwb.gov.tw/V7/earthquake/rtd_eq.htm
"""
import requests from bs4 import BeautifulSoup #import dateutil
def downloadCWBweb(year, month):
    """Download one month of the CWB earthquake activity list.

    The page is an ASP.NET web form, so the query takes two requests: the
    first fetches the form to read its hidden state fields, the second
    posts those fields back together with the selected year and month.

    Args:
        year: Four-digit year to query (int).
        month: Month to query, 1-12 (int); sent zero-padded to two digits.

    Returns:
        A list of rows. Each row is a list of the non-empty, stripped cell
        texts of one ``<tr>`` in the ``datalist4`` table (rows without any
        ``<td>`` cell, such as header rows, are skipped). An empty list is
        returned when either request fails or the table is not in the reply.
    """
    url = 'https://scweb.cwb.gov.tw/Page.aspx/?ItemId=20&loc=tw&adv=1'
    agent = ('Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) '
             'Gecko/20100101 Firefox/34.0')
    headers = {
        'Content-type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': agent,
    }
    # requests waits forever by default; bound both calls.
    timeout = 30

    # Step 1: fetch the form so the hidden ASP.NET state can be echoed back.
    response1 = requests.post(url, timeout=timeout)
    if response1.status_code != requests.codes.ok:
        print("CWB requesr fail")
        return []

    soup = BeautifulSoup(response1.text, "lxml")
    payload = {
        '__VIEWSTATE': _hidden_field_value(soup, '__VIEWSTATE'),
        '__VIEWSTATEGENERATOR': _hidden_field_value(soup, '__VIEWSTATEGENERATOR'),
        '__VIEWSTATEENCRYPTED': _hidden_field_value(soup, '__VIEWSTATEENCRYPTED'),
        '__EVENTVALIDATION': _hidden_field_value(soup, '__EVENTVALIDATION'),
        # Form control names use '$' as the separator, not '_'.
        'ctl03$ddlYear': "{:4d}".format(year),
        'ctl03$ddlMonth': "{:0>2d}".format(month),
        'ctl03$btnSearch': '',
    }

    # Step 2: post the search form for the requested year/month.
    response2 = requests.post(url, data=payload, headers=headers,
                              timeout=timeout)
    if response2.status_code != requests.codes.ok:
        print("CWB query fail")
        return []

    soup2 = BeautifulSoup(response2.text, "lxml")
    table = soup2.find('table', attrs={'class': 'datalist4'})
    if table is None:
        # Page layout changed or the query returned no result table.
        return []

    quakeData = []
    for row in table.find_all('tr'):
        cols = [cell.text.strip() for cell in row.find_all('td')]
        if cols:  # skip rows that have no <td> cells
            # Drop empty cell values.
            quakeData.append([item for item in cols if item])
    return quakeData


def _hidden_field_value(soup, field_id):
    """Return the ``value`` of the ``<input>`` with this id, or '' if absent."""
    tag = soup.find('input', id=field_id)
    if tag is None:
        return ''
    return tag.get('value', '')
# Example run: fetch and print the earthquake list for August 2018.
quakeData = downloadCWBweb(2018, 8)
# The download may yield nothing (None) when the request fails; iterate safely.
for data in quakeData or []:
    print(data)
留言列表