去年氣象局網頁改版為響應式網頁後，這篇爬蟲已經失效，一直沒時間更新。這次改用 Python + Selenium + BeautifulSoup，透過 webdriver 的 chromedriver 讓 Chrome 在網頁介面上完成查詢設定後，再從網頁內容爬取資料。程式先將每個月的彙整表抓出來；由於新版彙整表資訊不完整，缺少經緯度等資料，因此必須再開啟每一筆地震的詳細資料頁，重新爬取一次。

# -*- coding: utf-8 -*-

"""

Created on Sun May 23 14:05:07 2021

 

@author: ghosty

"""

 

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

         

# Constants
# Base URL of the CWB site; relative links scraped from its pages are
# appended to it to build absolute URLs.
cwbUrl = 'https://scweb.cwb.gov.tw'

# Create the Chrome session that the scraping functions below share.
options = Options()
options.add_argument("--disable-notifications")
# `options=` is the supported keyword; `chrome_options=` is deprecated.
# NOTE(review): recent Selenium releases also expect the driver path to be
# passed through a Service object instead of positionally -- confirm against
# the installed Selenium version.
chrome = webdriver.Chrome(ChromeDriverManager().install(), options=options)

 

def getQuakeUrlList(year, month):
    """Return the detail-page URLs of the earthquakes listed for one month.

    Drives the module-level ``chrome`` session: opens the earthquake data
    page, types the requested month into the search box, switches the
    table-length selector to its sixth option, waits, and parses the
    rendered HTML.  Every parsed table row is also printed.

    Args:
        year: Year to search for; converted with ``str()`` before typing.
        month: Month to search for; converted with ``str()`` and typed
            directly after the year (no separator, no zero padding).

    Returns:
        list: Absolute URLs (``cwbUrl`` + the ``href`` found in the seventh
        cell of each row), in table order.  Rows that do not have seven
        cells or carry no link are skipped.

    Raises:
        IndexError: If the page contains fewer than two ``<table>`` elements.
    """
    # Open the CWB earthquake data page.
    chrome.get(cwbUrl + '/zh-tw/earthquake/data/')

    # Type the month to search for into the search-month input box.
    xpath = '/html/body/div[1]/div[2]/div/div[2]/div[1]/div[1]/div/h2/input[1]'
    searchMonth = chrome.find_element(By.XPATH, xpath)
    # Strip the input's `readonly` attribute so that it accepts keystrokes.
    chrome.execute_script('arguments[0].removeAttribute("readonly")', searchMonth)
    searchMonth.clear()
    searchMonth.send_keys(str(year) + str(month))
    searchMonth.send_keys(Keys.RETURN)

    # Switch the table-length selector so that all rows are listed.
    xpath = '/html/body/div[1]/div[2]/div/div[2]/div[1]/div[4]/div/div[1]/div/div[2]/div/label/select'
    tableLength = chrome.find_element(By.XPATH, xpath)
    Select(tableLength).select_by_index(5)  # the option at index 5 is 'All'

    # Wait for the table to be redrawn, then parse the rendered HTML.
    time.sleep(3)
    soup = BeautifulSoup(chrome.page_source, 'html.parser')

    quakeList = []
    quakeTable = soup.find_all('table')[1].find_all('tr')
    for row in quakeTable[1:]:  # the first row is skipped (presumably the header)
        col = row.find_all('td')
        link = col[6].find('a') if len(col) > 6 else None
        href = link.get('href') if link is not None else None
        if href is None:
            # Not a data row (e.g. a placeholder shown when the month has
            # no entries) -- nothing to collect.
            continue
        quakeList.append(cwbUrl + href)
        print(col[0].text, col[1].text, col[2].text, col[3].text, col[4].text, col[5].text,
              '-', col[6].text[:19], href, link.text.replace(' ', ''))
    return quakeList

 

def _compactText(node):
    """Return the text of a parsed HTML node with newlines and spaces removed."""
    return node.text.replace('\n', '').replace(' ', '')


def getQuakeInfo(year, month):
    """Print the detail information of every earthquake in the given month.

    For each URL returned by ``getQuakeUrlList(year, month)`` the detail page
    is loaded in the module-level ``chrome`` session and its ``<li>`` items
    are scanned for the first one containing '發震時間'.  The item before it
    (its text, or the ``alt`` of its image when it has no text), that item
    and up to four following items are printed, followed by the URL of the
    report image.

    Args:
        year: Year to query, passed through to ``getQuakeUrlList``.
        month: Month to query, passed through to ``getQuakeUrlList``.

    Returns:
        None.  All results are written to standard output.
    """
    for quakeUrl in getQuakeUrlList(year, month):
        chrome.get(quakeUrl)
        time.sleep(2)  # give the detail page time to render
        soup = BeautifulSoup(chrome.page_source, 'html.parser')
        infoList = soup.find_all('li')
        for i, info in enumerate(infoList):
            if '發震時間' not in info.text:
                continue
            print('-----------------')
            # The preceding item serves as the heading.  Guard i == 0 so the
            # index does not wrap around to the last <li> on the page.
            if i > 0:
                previous = infoList[i - 1]
                heading = _compactText(previous)
                if heading:
                    print(heading)
                elif previous.img is not None and previous.img.has_attr('alt'):
                    # No text: fall back to the image's alt text.
                    print(previous.img['alt'])
            # Five consecutive items starting at the match (presumably origin
            # time, epicentre, depth, magnitude and relative location); the
            # slice tolerates a page that has fewer items left.
            for item in infoList[i:i + 5]:
                print(_compactText(item))
            # The report image lives under the same path with 'details'
            # swapped for 'imgs'.
            reportImg = quakeUrl.replace(cwbUrl + '/zh-tw/earthquake/details/',
                                         cwbUrl + '/zh-tw/earthquake/imgs/')
            print('地震報告:' + reportImg)
            break  # only the first matching item on the page is reported

        

# Scrape months 3 through 12 of the chosen year, then close the browser.
year = 2020
try:
    for month in range(3, 13):
        getQuakeInfo(year, month)
finally:
    # Always end the WebDriver session, even when scraping fails part-way,
    # so that no browser/driver process is left running.
    chrome.quit()

 

arrow
arrow
    文章標籤
    Python
    全站熱搜
    創作者介紹
    創作者 ghostyguo 的頭像
    ghostyguo

    No More Codes

    ghostyguo 發表在 痞客邦 留言(0) 人氣()