python3爬虫新浪微博关键词爬取特定内容特定时间(自定义什么时候的时间)等,有注释(日爬20000小问题)
微博搜索爬取内容时间:可以自定义添加自己要爬的内容(如视频、图片等),功能强大;可自定义需要的时间段和内容的关键字,高效。 from selenium import webdriver; from lxml import etree; from urllib import parse; from time import sleep; import datetime; from xlutils.copy import copy; impo…
·
微博搜索爬取内容时间
可以自定义添加自己要爬的内容,如视频图片等
功能强大
自定义需要的时间段,内容的关键字,高效
from selenium import webdriver
from lxml import etree
from urllib import parse
from time import sleep
import datetime
from xlutils.copy import copy
import xlrd
import time
import re
# Search term to crawl for.
keyword = '爬虫'
# Start date of the crawl: year, month, day.
y, m, d = 2021, 4, 1
# Number of days to crawl.
days = 10
# Percent-encoded keyword, safe to embed in the search URL.
url_keyword = parse.quote(keyword)
def getday(y: int, m: int, d: int, n: int) -> str:
    """Return the date ``n`` days away from ``y``-``m``-``d`` as ``'YYYY-MM-DD'``.

    Args:
        y: Year of the base date.
        m: Month of the base date.
        d: Day of the month of the base date.
        n: Number of days to add to the base date; may be zero or negative.

    Returns:
        The shifted date formatted with ``%Y-%m-%d``.

    Raises:
        ValueError: If ``y``, ``m`` and ``d`` do not form a valid date.
    """
    base_date = datetime.datetime(y, m, d)
    shifted_date = base_date + datetime.timedelta(days=n)
    # Use a separate name for the result rather than rebinding parameter ``d``.
    return shifted_date.strftime('%Y-%m-%d')
def p(days, x):
    """Crawl Weibo search results hour by hour and store them in ``wb_py.xls``.

    For each of ``days`` consecutive days, starting at the module-level
    ``y``/``m``/``d`` date, and for each of the 24 one-hour windows of that
    day, the search page for the module-level ``url_keyword`` is loaded in
    Chrome. Every ``card-feed`` entry on the page is parsed and written as
    one row to sheet 0 of ``wb_py.xls``:

    * column 1: author name
    * column 2: link to the post
    * column 3: post text
    * column 4: post time
    * column 5: post source

    Args:
        days: Number of consecutive days to crawl.
        x: Index of the first spreadsheet row to write; it is advanced by
            one after every row written.

    Note:
        ``wb_py.xls`` is opened for reading before rows are added, so it
        must already exist. The chromedriver path is hard-coded.
    """
    for i in range(days):
        data = getday(y, m, d, i)
        # The last window of a day (23h) ends at hour 0 of the following
        # day, i.e. day offset i + 1.
        next_day = getday(y, m, d, i + 1)
        for j in range(24):  # one search URL per hour of the day
            if j == 23:
                data_add_hour = data + '-' + str(j) + ':' + next_day + '-' + str(0)
            else:
                data_add_hour = data + '-' + str(j) + ':' + data + '-' + str(j + 1)

            # The time filter parameter is "timescope"; written with an
            # explicit "&" so it cannot be mistaken for the "&times" entity.
            url = ('https://s.weibo.com/weibo?q=' + url_keyword
                   + '&typeall=1&suball=1&timescope=custom:' + data_add_hour)
            print(url)

            # NOTE(review): ``executable_path`` is the Selenium 3 style
            # keyword -- confirm it matches the installed Selenium version.
            bro = webdriver.Chrome(executable_path=r'D:\python\chorm\chromedriver.exe')
            try:
                bro.get(url)
                sleep(2)  # wait for the page to finish loading
                page_text = bro.page_source
                print(page_text)
                sleep(1)
            finally:
                # Always close the browser, even if loading the page failed.
                bro.quit()

            # Parse the rendered page.
            tree = etree.HTML(page_text)
            print(tree)
            wb_list = tree.xpath("//div[@class='card-feed']")
            if not wb_list:
                # No results in this hour: nothing to write.
                continue

            # Open the workbook once per page instead of once per row, and
            # save it after all rows of the page have been written.
            rb = xlrd.open_workbook('wb_py.xls')
            wb = copy(rb)  # writable copy of the workbook that was read
            ws = wb.get_sheet(0)
            for li in wb_list:
                wb_time = li.xpath("./div[2]/p[3]/a[1]/text()|./div[2]/p[2]/a[1]/text()")
                wb_name = li.xpath("./div[2]/div[1]/div[2]/a[1]/text()")
                print(wb_name)
                wb_text = li.xpath("./div[2]/p[1]//text()")
                print(wb_text)
                wb_from = li.xpath("./div[2]/p[@class='from']/a[2]/text()")
                wb_href = li.xpath("./div[2]/p[@class='from']/a[1]/@href")
                print(wb_href)
                ws.write(x, 1, wb_name)
                ws.write(x, 2, wb_href)
                ws.write(x, 3, wb_text)
                ws.write(x, 4, wb_time)
                ws.write(x, 5, wb_from)
                x = x + 1
                print(x)
            wb.save('wb_py.xls')
if __name__ == '__main__':
    # Start the crawl when run as a script, writing from row index 1.
    p(days, x=1)
```交流加 a1953233122
回复比较及时,记得备注
更多推荐
所有评论(0)