Below is a detailed, step-by-step walkthrough of how to batch-scrape CNKI literature abstracts with Python, covering environment setup, code implementation, and important caveats.

Step 1: Environment Setup

1. Install Python

Make sure Python 3.6 or later is installed.

2. Install the required libraries

pip install requests beautifulsoup4 fake_useragent
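
To confirm the installation worked, a quick import check is enough (a trivial sketch):

# Quick sanity check that the libraries import correctly.
import requests
import bs4
import fake_useragent

print('requests', requests.__version__)
print('beautifulsoup4', bs4.__version__)
print('fake_useragent OK')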

3. Prepare your tools

  • A text editor or IDE (e.g., VS Code, PyCharm)
  • Browser developer tools (for analyzing the page structure)

Step 2: Understand the CNKI Page Structure

  1. Visit the CNKI website (https://www.cnki.net/)
  2. Search for a keyword, e.g. "人工智能" (artificial intelligence)
  3. Open the browser developer tools (F12)
  4. Analyze:
    • the URL and parameters of the search request
    • the HTML structure of the results page
    • the HTML structure of the detail page
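
To aid this analysis, the short sketch below fetches one search results page and saves the prettified HTML to a local file so you can study the selectors offline. It is a minimal sketch: the URL and query parameters simply mirror the ones used later in this article and are assumptions, not a documented CNKI interface.

import requests
from bs4 import BeautifulSoup

# Minimal sketch: save a search results page to disk for offline inspection.
# The URL and query parameters mirror the ones used later in this article;
# they are assumptions, not a documented CNKI interface.
url = 'https://kns.cnki.net/kns8/defaultresult/index'
params = {'kw': '人工智能', 'kpage': '1'}
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://www.cnki.net/'}

resp = requests.get(url, params=params, headers=headers, timeout=15)
print(resp.status_code, resp.url)

soup = BeautifulSoup(resp.text, 'html.parser')
with open('cnki_search_page.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())  # compare this file with what DevTools shows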

Step 3: Basic Code Implementation

1. Import the required libraries

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import random
import json

2. Create the crawler class

class CNKICrawler:
    def __init__(self):
        self.ua = UserAgent()  # random User-Agent
        self.session = requests.Session()
        self.headers = {
            'User-Agent': self.ua.random,
            'Referer': 'https://www.cnki.net/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
        self.proxies = None  # proxies can be set here (optional)

3. Implement the search function

    def search(self, keyword, page=1):
        """
        搜索文献
        :param keyword: 搜索关键词
        :param page: 页码
        :return: 搜索结果列表
        """
        search_url = 'https://kns.cnki.net/kns8/defaultresult/index'
        params = {
            'kw': keyword,
            'korder': 'relevant',
            'kpage': str(page),
            'kspan': '10'  # 10 results per page
        }
        
        try:
            response = self.session.get(
                search_url,
                params=params,
                headers=self.headers,
                proxies=self.proxies,
                timeout=15
            )
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            results = []
            
            # parse the search results
            for item in soup.select('.result-table-list tbody tr'):
                try:
                    title_elem = item.select_one('a.result-table-title')
                    if not title_elem:
                        continue
                        
                    result = {
                        'title': title_elem.get_text(strip=True),
                        'url': title_elem['href'] if title_elem['href'].startswith('http') 
                              else f'https://kns.cnki.net{title_elem["href"]}',
                        'authors': item.select_one('.result-table-authors').get_text(strip=True) 
                                  if item.select_one('.result-table-authors') else '',
                        'source': item.select_one('.result-table-source').get_text(strip=True) 
                                 if item.select_one('.result-table-source') else '',
                        'date': item.select_one('.result-table-date').get_text(strip=True) 
                               if item.select_one('.result-table-date') else ''
                    }
                    results.append(result)
                except Exception as e:
                    print(f"解析结果项出错: {str(e)}")
                    continue
            
            return results
            
        except Exception as e:
            print(f"搜索出错: {str(e)}")
            return None

4. Implement abstract retrieval

    def get_abstract(self, detail_url):
        """
        获取文献摘要
        :param detail_url: 文献详情页URL
        :return: 摘要文本
        """
        try:
            response = self.session.get(
                detail_url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=15
            )
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            abstract_div = soup.find('div', class_='abstract-content')
            if abstract_div:
                return abstract_div.get_text(strip=True)
            return None
            
        except Exception as e:
            print(f"获取摘要出错: {str(e)}")
            return None

5. Implement batch crawling

    def batch_crawl(self, keywords, max_per_keyword=3, max_pages=1):
        """
        批量爬取多个关键词的摘要
        :param keywords: 关键词列表
        :param max_per_keyword: 每个关键词最多爬取的文献数
        :param max_pages: 每个关键词最多搜索的页数
        :return: 爬取结果字典
        """
        results = {}
        
        for keyword in keywords:
            print(f"\n正在处理关键词: {keyword}")
            keyword_results = []
            count = 0
            
            for page in range(1, max_pages + 1):
                if count >= max_per_keyword:
                    break
                    
                print(f"正在搜索第 {page} 页...")
                literatures = self.search(keyword, page)
                if not literatures:
                    break
                    
                for lit in literatures:
                    if count >= max_per_keyword:
                        break
                        
                    print(f"正在获取 '{lit['title'][:30]}...' 的摘要...")
                    abstract = self.get_abstract(lit['url'])
                    if abstract:
                        lit['abstract'] = abstract
                        keyword_results.append(lit)
                        count += 1
                    
                    # random delay between requests
                    time.sleep(random.uniform(2, 5))
            
            results[keyword] = keyword_results
            # longer delay after each keyword is finished
            time.sleep(random.uniform(5, 10))
        
        return results

Step 4: Use the Crawler

1. Initialize and run the crawler

if __name__ == '__main__':
    crawler = CNKICrawler()
    
    # set proxies (optional)
    # crawler.proxies = {
    #     'http': 'http://your_proxy:port',
    #     'https': 'https://your_proxy:port'
    # }
    
    # keyword list
    keywords = ["人工智能", "机器学习", "深度学习", "自然语言处理"]
    
    # crawl abstracts (up to 3 papers per keyword, 1 search page per keyword)
    results = crawler.batch_crawl(keywords, max_per_keyword=3, max_pages=1)
    
    # save the results
    with open('cnki_abstracts.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    
    print("\n爬取完成,结果已保存到 cnki_abstracts.json")

Step 5: Handle Anti-Scraping Measures

1. Random User-Agent

This is already implemented in the code via the fake_useragent library.
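
Note, however, that the class above picks self.ua.random only once in __init__, so every request reuses the same User-Agent for the crawler's whole lifetime. If you want a fresh User-Agent per request, one possible tweak (a hypothetical helper, not part of the original code) looks like this:

    def _refresh_user_agent(self):
        # Hypothetical helper: pick a new random User-Agent before each request.
        self.headers['User-Agent'] = self.ua.random

You would then call self._refresh_user_agent() at the top of search() and get_abstract().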

2. Request delays

The code already includes random delays:

time.sleep(random.uniform(2, 5))  # short delay
time.sleep(random.uniform(5, 10))  # long delay
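
Beyond fixed random delays, you may also want to back off and retry when a request fails (for example on a timeout or a temporary block). The sketch below is one possible way to do it; the function name and parameters are illustrative, not part of the original crawler:

import random
import time

import requests

def get_with_backoff(session, url, max_retries=3, **kwargs):
    # Illustrative helper: retry a GET with exponential backoff plus jitter.
    for attempt in range(max_retries):
        try:
            resp = session.get(url, timeout=15, **kwargs)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            wait = (2 ** attempt) * 5 + random.uniform(0, 3)
            print(f"Request failed ({e}); retrying in {wait:.1f} s...")
            time.sleep(wait)
    return None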

3. Use proxies (optional)

Uncomment the following code and fill in your proxy:

# crawler.proxies = {
#     'http': 'http://your_proxy:port',
#     'https': 'https://your_proxy:port'
# }

Step 6: Process and Analyze the Results

Once crawling is complete, you can load the results for analysis:

import json

# load the results
with open('cnki_abstracts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# print the results
for keyword, articles in data.items():
    print(f"\n关键词: {keyword}")
    for article in articles:
        print(f"\n标题: {article['title']}")
        print(f"作者: {article['authors']}")
        print(f"来源: {article['source']}")
        print(f"日期: {article['date']}")
        print(f"摘要: {article['abstract'][:200]}...")

Notes and Caveats

  1. Legality: make sure your scraping complies with CNKI's terms of service and applicable laws and regulations
  2. Rate control: do not send requests too frequently, to avoid having your IP blocked
  3. Data use: use the data only for personal study and research, never for commercial purposes
  4. Page structure changes: CNKI may change its page layout at any time, so the parsing logic needs to be updated periodically
  5. CAPTCHA handling: if a CAPTCHA appears, you will need to add CAPTCHA recognition separately (a simple detection sketch follows this list)
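
Full CAPTCHA recognition is beyond the scope of this article, but you can at least detect when a verification page has been served and pause instead of parsing it as a results page. The check below is a rough, assumed heuristic (the marker strings are guesses, not documented CNKI behavior):

def looks_like_captcha(html):
    # Rough heuristic: assume a verification page contains one of these markers.
    markers = ['验证码', 'verify', '拼图验证']
    return any(m in html for m in markers)

# Possible use inside search() / get_abstract(), before parsing:
# if looks_like_captcha(response.text):
#     print("Possible verification page detected; pausing...")
#     time.sleep(60)
#     return None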

Complete Code

Here is the complete, executable code:

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import random
import json

class CNKICrawler:
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.headers = {
            'User-Agent': self.ua.random,
            'Referer': 'https://www.cnki.net/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
        self.proxies = None

    def search(self, keyword, page=1):
        search_url = 'https://kns.cnki.net/kns8/defaultresult/index'
        params = {
            'kw': keyword,
            'korder': 'relevant',
            'kpage': str(page),
            'kspan': '10'
        }
        
        try:
            response = self.session.get(
                search_url,
                params=params,
                headers=self.headers,
                proxies=self.proxies,
                timeout=15
            )
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            results = []
            
            for item in soup.select('.result-table-list tbody tr'):
                try:
                    title_elem = item.select_one('a.result-table-title')
                    if not title_elem:
                        continue
                        
                    result = {
                        'title': title_elem.get_text(strip=True),
                        'url': title_elem['href'] if title_elem['href'].startswith('http') 
                              else f'https://kns.cnki.net{title_elem["href"]}',
                        'authors': item.select_one('.result-table-authors').get_text(strip=True) 
                                  if item.select_one('.result-table-authors') else '',
                        'source': item.select_one('.result-table-source').get_text(strip=True) 
                                 if item.select_one('.result-table-source') else '',
                        'date': item.select_one('.result-table-date').get_text(strip=True) 
                               if item.select_one('.result-table-date') else ''
                    }
                    results.append(result)
                except Exception as e:
                    print(f"解析结果项出错: {str(e)}")
                    continue
            
            return results
            
        except Exception as e:
            print(f"搜索出错: {str(e)}")
            return None

    def get_abstract(self, detail_url):
        try:
            response = self.session.get(
                detail_url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=15
            )
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            abstract_div = soup.find('div', class_='abstract-content')
            if abstract_div:
                return abstract_div.get_text(strip=True)
            return None
            
        except Exception as e:
            print(f"获取摘要出错: {str(e)}")
            return None

    def batch_crawl(self, keywords, max_per_keyword=3, max_pages=1):
        results = {}
        
        for keyword in keywords:
            print(f"\n正在处理关键词: {keyword}")
            keyword_results = []
            count = 0
            
            for page in range(1, max_pages + 1):
                if count >= max_per_keyword:
                    break
                    
                print(f"正在搜索第 {page} 页...")
                literatures = self.search(keyword, page)
                if not literatures:
                    break
                    
                for lit in literatures:
                    if count >= max_per_keyword:
                        break
                        
                    print(f"正在获取 '{lit['title'][:30]}...' 的摘要...")
                    abstract = self.get_abstract(lit['url'])
                    if abstract:
                        lit['abstract'] = abstract
                        keyword_results.append(lit)
                        count += 1
                    
                    time.sleep(random.uniform(2, 5))
            
            results[keyword] = keyword_results
            time.sleep(random.uniform(5, 10))
        
        return results

if __name__ == '__main__':
    crawler = CNKICrawler()
    
    # set proxies (optional)
    # crawler.proxies = {
    #     'http': 'http://your_proxy:port',
    #     'https': 'https://your_proxy:port'
    # }
    
    keywords = ["人工智能", "机器学习", "深度学习", "自然语言处理"]
    results = crawler.batch_crawl(keywords, max_per_keyword=3, max_pages=1)
    
    with open('cnki_abstracts.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    
    print("\n爬取完成,结果已保存到 cnki_abstracts.json")

Summary

This article walked through the complete workflow for batch-scraping CNKI literature abstracts with Python: environment setup, code implementation, anti-scraping handling, and result analysis. Remember that in actual use you must comply with applicable laws, regulations, and the site's terms, and keep the crawl rate reasonable.
