"""Scrape the Douban Top 250 movie list pages into ``douban250.csv``.

Each list page is downloaded, the per-film fields are extracted with CSS
selectors plus plain string splitting, and the resulting rows are appended
to the CSV file. Run as a script to fetch all ten pages (25 films each).
"""
import csv
import os

import requests
from bs4 import BeautifulSoup

# Output file; rows are appended so repeated page fetches accumulate.
CSV_PATH = 'douban250.csv'

# Column names written once at the top of the CSV file.
listtitle = ['title', 'director', 'actor', 'year', 'area', 'type', 'score', 'evalueate']

# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;WOW64)'
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# List-page URL template; ``%s`` is the zero-based offset of the first film.
PAGE_URL = 'https://movie.douban.com/top250?start=%s&filter='


def _parse_film(title, content, filmscore, votes):
    """Build one CSV row from the raw text of a film's page elements.

    Args:
        title: Text of the film's first title span.
        content: Text of the info paragraph. The code expects the second
            whitespace-separated token to be the director, an optional
            ``主演:`` marker followed by the actor, and a trailing
            ``year / area / type`` segment separated by ``/``.
        filmscore: Text of the rating span.
        votes: Text of the rating-count span; the characters ``人评价`` are
            stripped from both ends.

    Returns:
        ``[title, director, actor, year, area, filmtype, filmscore, num]``
        where ``actor`` is ``None`` when the ``主演:`` marker is absent.

    Raises:
        IndexError: If ``content`` does not have the expected layout.
    """
    num = votes.strip('人评价')
    text = content.strip()

    # The director name is the token right after the leading label token.
    director = text.split()[1]

    # Only the first whitespace-delimited token after the marker is kept.
    if "主演:" in content:
        actor = text.split('主演:')[1].split()[0]
    else:
        actor = None

    # The last three '/'-separated fields are year, area and type; the year
    # is the final token of its field because earlier text precedes it.
    fields = text.split('/')
    year = fields[-3].split()[-1]
    area = fields[-2].strip()
    filmtype = fields[-1].strip()

    return [title, director, actor, year, area, filmtype, filmscore, num]


def getHtml(url):
    """Download one list page, parse its films and append them to the CSV.

    Args:
        url: Address of a Douban Top 250 list page.

    Returns:
        None. The parsed rows are printed and appended to ``douban250.csv``.

    Raises:
        requests.RequestException: If the request fails or times out.
        IndexError: If a film's info paragraph has an unexpected layout.
    """
    # Fetch the page.
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    html = response.text

    # Parse the page; the four selections are matched up positionally.
    soup = BeautifulSoup(html, 'lxml')
    filmtitle = soup.select('div.hd > a > span:nth-child(1)')
    ct = soup.select('div.bd > p:nth-child(1)')
    score = soup.select('div.bd > div > span.rating_num')
    evalue = soup.select('div.bd > div > span:nth-child(4)')
    print(score)

    filmlist = [
        _parse_film(t.text, c.text, s.text, e.text)
        for t, c, s, e in zip(filmtitle, ct, score, evalue)
    ]
    print(filmlist)

    # Store the rows.
    with open(CSV_PATH, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(filmlist)


def _write_header_if_needed():
    """Write the CSV header row unless the file already has content.

    The file is opened in append mode everywhere, so writing the header
    unconditionally would insert a new header row on every run.
    """
    if os.path.exists(CSV_PATH) and os.path.getsize(CSV_PATH) > 0:
        return
    with open(CSV_PATH, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(listtitle)


def main():
    """Write the header if needed, then scrape all ten list pages."""
    _write_header_if_needed()
    # Offsets 0, 25, ..., 225 cover the 250 films at 25 per page.
    for i in range(0, 226, 25):
        getHtml(PAGE_URL % (i))


if __name__ == '__main__':
    main()