# Requires the html5lib parser to be installed (pip install html5lib).
from bs4 import BeautifulSoup

# `dom` is the raw HTML string fetched earlier — TODO confirm where it is set.
soup = BeautifulSoup(dom, 'html5lib')
HTML parsing
get data
# Inspect the <title> element in several ways.
print(soup.title)         # <title>Angela's Personal Site</title>
print(soup.title.name)    # tag name: title
print(soup.title.string)  # Angela's Personal Site
print(soup.prettify())    # pretty-printed document tree
# Attribute access returns the FIRST occurrence of each tag type.
print(soup.a)   # first <a>
print(soup.p)   # first <p>
print(soup.li)  # first <li>
# Collect every <a> tag, then print each link's target.
links = soup.find_all('a')
print(links)
for link in links:
    # Any of these return the anchor text:
    # print(link.get_text())
    # print(link.getText())
    # print(link.text)
    # Read the href attribute of the link.
    print(link.get('href'))
# Narrow a find() with attribute filters.
print(soup.find("h1", id='name'))
# `class` is a Python keyword, so BeautifulSoup spells it `class_`.
print(soup.find("h3", class_='heading'))
# find_all() with a CSS-class filter (second positional arg is the class).
divs = soup.find_all('div', 'r-ent')
for d in divs:
    ...  # placeholder — process each matched <div> here (original had invalid ".....")
# First <h4> tag, found two equivalent ways.
print(soup.h4)
print(soup.find("h4"))
# Text of that tag.
print(soup.find("h4").text)
# Text of the <a> nested under the first <h4>.
print(soup.h4.a.text)
# Text of every <h4>'s anchor
# (h4_tags is built in a later snippet — TODO confirm it is defined first).
for h4 in h4_tags:
    print(h4.a.text)
print(soup.h4.a.text)
# All of the following class filters are equivalent:
#   soup.find_all('h4', class_='card-title')
#   soup.find_all('h4', 'card-title')
#   soup.find_all('h4', {'class': 'card-title'})
h4_tags = soup.find_all('h4', 'card-title')
for h4 in h4_tags:
    print(h4.a.text)

# Look a tag up by id and print its stripped text.
print('soup.find(id="mac-p").text.strip() : ')
print('-' + soup.find(id='mac-p').text.strip() + '-')
# Non-standard attributes cannot be passed as keyword arguments —
# print(soup.find(data-foo='mac-p').text.strip())  # would be a SyntaxError
print('===============================')
# Use an attribute dict instead:
print(soup.find_all('', {'data-foo': 'mac-foo'}))

# Blog content: pick individual fields out of each div.content ...
divs = soup.find_all('div', 'content')
for div in divs:
    print(div.h6.text.strip(), div.h4.text.strip(), div.p.text.strip())
# ... or dump every stripped string in one go.
for div in divs:
    print([s for s in div.stripped_strings])
# Find the <li class="nexttxt"> and the <a> nested inside it.
# Renamed from `next` so the builtin next() is not shadowed.
next_link = soup.find('li', {'class': 'nexttxt'}).find('a')
# Tags support dict-style access to attributes.
print('next url = ' + next_link['href'])
select
# CSS selectors: first match ...
print(soup.select_one("p a"))
print(soup.select_one("#name"))
# ... and every match.
print(soup.select(".heading"))
parent/sibling/children
# Walk up to the parent, then sideways to its previous sibling.
price = link.parent.previous_sibling.text
# Materialize a row's children into a list.
all_tds = list(row.children)
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def page_down(element, times, sec):
    """Press PAGE_DOWN on `element` `times` times, sleeping `sec` seconds between presses.

    The pause lets lazy-loaded content render and makes the scrolling
    look less bot-like (bot id protection).
    Original had `defpage_down` / `inrange` (missing spaces — invalid syntax).
    """
    print("[%] Scrolling down.")
    for i in range(times):
        print(i)
        element.send_keys(Keys.PAGE_DOWN)
        sleep(sec)
# Accumulates scraped posts; also used to check whether a post was already seen.
post_list = []
if __name__ == '__main__':
    browser = webdriver.Chrome()
    browser.get('https://www.dcard.tw/f')
    sleep(2)  # wait for the initial page render

    # The <body> element receives the PAGE_DOWN key presses.
    element = browser.find_element(By.TAG_NAME, "body")

    # Scroll and scrape until at least 10 posts are collected,
    # giving up after 10 scroll rounds.
    # (Original had `inrange` — missing space — plus a redundant
    # `count = 0` and an unused `html` local.)
    for _ in range(10):
        page_down(element, 2, 0.5)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        # Defined elsewhere in the project — TODO confirm its signature.
        find_top10_hot_title(soup, post_list, 'post-*')
        print(post_list)
        if len(post_list) >= 10:
            break

    # exit browser
    browser.quit()
Example 2 — get the HTML from an element
# Grab each news card's inner HTML and parse it separately.
# (Original had `iflen` — missing space — which is invalid syntax.)
news = browser.find_elements(By.CLASS_NAME, "story-list__news")
i = 1
for item in news:
    item_html = item.get_attribute('innerHTML')
    soup = BeautifulSoup(item_html, 'html5lib')
    title = soup.find('div', {'class': 'story-list__text'}).find('a').text
    time_tag = soup.find('div', {'class': 'story-list__info'})
    if time_tag:
        # Second line of the info block is the timestamp — TODO confirm markup.
        time_text = time_tag.text.strip().split('\n')[1].strip()
    else:
        time_text = ""
    # An empty title means the entry does not exist.
    if len(title) != 0:
        print(f'{i:02d}{time_text:16s}{title}')
        i += 1
# Convert indentation to spaces: press Ctrl+Shift+P and run "Convert Indentation to Spaces"
# Force the Tab key to insert **spaces**:
"editor.insertSpaces": true,
# Trim trailing whitespace:
"files.trimTrailingWhitespace": true,