爬虫爬取新闻实战：小白如何用Python迅速学会爬取千条新闻？

发布时间：2025-12-24 00:23:15 点击量：

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from urllib import error
from urllib import request
import os
import time
import sys
# Page the browser opens first; replace it with your own start page if needed.
url = "https://www.google.com"
# Browser to drive (Google Chrome).
explorer = "Chrome"
# Directory the downloaded images are stored in.
imgs_dir = "images"
# Keywords typed into the search box.
search_words = "珈乐Carol"
def progress_callback(count_of_blocks, block_size, total_size):
    """Render a one-line download progress indicator on stdout.

    Written to be used as the ``reporthook`` of
    ``urllib.request.urlretrieve``, which calls it with three arguments.

    Args:
        count_of_blocks: Number of blocks transferred so far.
        block_size: Size of one block, in bytes.
        total_size: Total size of the file, in bytes. ``urlretrieve`` passes
            -1 when the response carries no Content-Length.
    """
    transferred = count_of_blocks * block_size
    if total_size <= 0:
        # Unknown size (or an empty file): a percentage cannot be computed and
        # the division below would fail, so show a plain byte counter instead.
        sys.stdout.write("\r%d bytes downloaded" % transferred)
        sys.stdout.flush()
        return
    # The bar is 50 cells wide. The last block usually overshoots the total,
    # so clamp to a full bar.
    progress = min(50, int(50 * transferred / total_size))
    # update progress bar
    sys.stdout.write("\r[%s%s] %d%%" % ('█' * progress, '  ' * (50 - progress), progress * 2))
    sys.stdout.flush()
class CrawlSelenium:
    """Drive a real browser through a Google image search and save the results.

    Usage: construct, optionally call ``set_loading_strategy()``, then call
    ``crawl()``.
    """

    # Browser names accepted by crawl(). Each one is looked up as an attribute
    # of selenium's ``webdriver`` module, so a name is only usable when the
    # installed Selenium release still provides that driver class.
    SUPPORTED_EXPLORERS = ("Chrome", "Opera", "Firefox", "Edge")
    # Upper bound on scroll/poll rounds, so an unrecognised status message
    # cannot keep the loading loop spinning forever.
    MAX_SCROLL_ROUNDS = 500
    # Pause between two status polls, in seconds.
    POLL_INTERVAL = 0.5

    def __init__(self, explorer="Chrome", url="https://www.google.com"):
        """Remember which browser to drive and which page to open first."""
        self.url = url
        self.explorer = explorer
        # Both are filled in lazily: options by set_loading_strategy() (or by
        # crawl() when it was never called), driver by crawl().
        self.options = None
        self.driver = None

    def set_loading_strategy(self, strategy="normal"):
        """Create the browser options and set their page load strategy.

        Args:
            strategy: Value assigned to ``options.page_load_strategy``.
        """
        self.options = self._new_options()
        self.options.page_load_strategy = strategy

    def _new_options(self):
        """Return a fresh options object for the configured browser."""
        options_cls = getattr(webdriver, self.explorer + "Options", None)
        # Fall back to the Chrome options class imported by this file when
        # selenium exposes no dedicated options class for the browser.
        if options_cls is None:
            options_cls = Options
        return options_cls()

    def crawl(self, keywords=None, output_dir=None):
        """Search for images, load as many results as possible, download them.

        Args:
            keywords: Text typed into the search box. Defaults to the
                module-level ``search_words``.
            output_dir: Directory the images are written to. Defaults to the
                module-level ``imgs_dir``. Created if missing.

        Returns:
            The number of images that were downloaded.

        Raises:
            ValueError: If ``self.explorer`` is not a supported browser name,
                or the installed selenium has no driver class for it.
        """
        # Validate first, so a typo fails with a clear message instead of an
        # AttributeError on a driver that was never created.
        if self.explorer not in self.SUPPORTED_EXPLORERS:
            raise ValueError(
                "unsupported explorer %r, expected one of: %s"
                % (self.explorer, ", ".join(self.SUPPORTED_EXPLORERS))
            )
        driver_cls = getattr(webdriver, self.explorer, None)
        if driver_cls is None:
            raise ValueError(
                "the installed selenium provides no webdriver.%s" % self.explorer
            )
        if keywords is None:
            keywords = search_words
        if output_dir is None:
            output_dir = imgs_dir
        if self.options is None:
            # crawl() was called without set_loading_strategy(): use defaults.
            self.set_loading_strategy()
        os.makedirs(output_dir, exist_ok=True)

        # instantiate driver according to corresponding explorer
        self.driver = driver_cls(options=self.options)
        try:
            self._open_image_results(keywords)
            self._load_all_results()
            return self._download_images(output_dir)
        finally:
            # Always end the browser session, even when a step above failed.
            self.driver.quit()

    def _open_image_results(self, keywords):
        """Run the search on the start page and switch to the image results."""
        # navigate to url
        self.driver.get(self.url)
        # locate input field
        search_input = self.driver.find_element(By.NAME, 'q')
        # emulate user input and enter to search
        webdriver.ActionChains(self.driver).move_to_element(search_input).send_keys(keywords + Keys.ENTER).perform()
        # navigate to google image through the navigation link
        self.driver.find_element(By.LINK_TEXT, '图片').click()

    def _load_all_results(self):
        """Scroll and click "show more" until the page reports no more results."""
        scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight)"
        self.driver.execute_script(scroll_to_bottom)
        try:
            show_more_button = self.driver.find_element(By.CSS_SELECTOR, "input[value='显示更多搜索结果']")
            for _ in range(self.MAX_SCROLL_ROUNDS):
                # The page announces its loading state in this element; act on it.
                message = self.driver.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute('textContent')
                if message == '看来您已经看完了所有内容':
                    break
                if message == '无法加载更多内容，点击即可重试。':
                    show_more_button.click()
                else:
                    # Still loading, freshly loaded, or unknown: keep scrolling.
                    self.driver.execute_script(scroll_to_bottom)
                    if message == '新内容已成功加载。向下滚动即可查看更多内容。' and show_more_button.is_displayed():
                        show_more_button.click()
                time.sleep(self.POLL_INTERVAL)
        except Exception as err:
            # Best effort: whatever goes wrong while loading more results,
            # report it and continue with the images that are already present.
            print(err)

    def _download_images(self, output_dir):
        """Save every result image that has a ``src``; return how many were saved."""
        # find all image elements in google image result page
        imgs = self.driver.find_elements(By.CSS_SELECTOR, "img.rg_i.Q4LuWd")
        img_count = 0
        for img in imgs:
            try:
                img_url = img.get_attribute("src")
                if not img_url:
                    # No usable source on this element: nothing to fetch.
                    continue
                # image per second
                time.sleep(1)
                print('\ndownloading image ' + str(img_count) + ': ')
                path = os.path.join(output_dir, str(img_count) + "_img.jpg")
                request.urlretrieve(url=img_url, filename=path, reporthook=progress_callback, data=None)
                img_count += 1
            except Exception as err:
                # One failed image (HTTP error, stale element, bad URL, ...)
                # must not abort the remaining downloads.
                print(err)
        return img_count
def main():
    """Configure the crawler, make sure the image directory exists, and run it."""
    # setting
    crawl_s = CrawlSelenium(explorer, url)
    crawl_s.set_loading_strategy("normal")
    # make directory; exist_ok avoids the check-then-create race and also
    # handles a nested path
    os.makedirs(imgs_dir, exist_ok=True)
    # crawling
    crawl_s.crawl()


if __name__ == "__main__":
    main()

很多人希望从网络上自动采集新闻内容，却发现现成的爬虫程序时常失灵，这背后其实与网页结构的变化密切相关。

文字爬虫的核心逻辑

文字爬虫的目的是从网页中获取特定的文字信息，例如一篇新闻报道的标题和正文。它的本质是解析网页的 HTML 源代码，并依据源代码的结构定位和提取内容。由于每个网站的页面布局和代码写法都不一样，针对一个网站编写的爬虫程序往往无法直接用于另一个网站。这也是网上找到的旧代码可能失效的原因：通常并非代码本身写错了，而是目标网站的页面结构已经更新。

从目标到步骤：明确爬取流程

from bs4 import BeautifulSoup
import requests
# Fetch the news page. The timeout keeps a stalled server from hanging the
# script forever (requests waits indefinitely by default).
res = requests.get('http://www.xinhuanet.com/fortune/2021-06/18/c_1127577431.htm', timeout=10)  # URL of the news article
# Stop on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# Let the page decide its own decoding: web pages use many encodings (UTF-8,
# GBK, ...), and decoding with the wrong one produces garbled text, so use the
# encoding detected from the content rather than one fixed encoding.
res.encoding = res.apparent_encoding
soup = BeautifulSoup(res.text, 'html5lib')  # parse the page with the html5lib parser
print(soup)  # show the page source

欲爬取一则新闻，那么首先得明晰各个步骤。第一步是要获取新闻网页的链接。第二步是朝着此链接发送网络请求，进而获取该页面完整的HTML代码。第三步，同时也是最为关键的一步，是在这一堆繁杂的代码当中，确切地寻找到并分离出新闻正文。这个过程要求你清晰地晓得你所需要的是什么，并规划好每一步怎样通过程序指令去达成。

获取与分析网页源代码

向一个网址发送请求之后，服务器返回的便是该页面中的HTML源代码，这些代码于浏览器里会被渲染成我们所见到的图文适配的页面，对于爬虫来讲，它所面对的是最为原始的、涵盖各种标签的代码，这些代码虽说看上去繁杂，然而新闻的标题、正文、发布时间等信息皆规律地嵌套于特定的标签之中，等候被识别以及提取。

# Element selector: collect every <p> element of the parsed page.
data = soup.select('p')
# Join the whitespace-stripped text of all paragraphs, without a separator.
news_text = ''.join(paragraph.text.strip() for paragraph in data)
print(news_text)

定位并提取正文内容

提取新闻正文要从 HTML 代码入手，这就需要先观察和分析页面结构。你可以借助浏览器的开发者工具，查看新闻正文部分对应的 HTML 标签究竟是什么，例如是某个 `<div>` 容器，还是一个个 `<p>` 段落标签。找到这个规律之后，就可以在程序里编写相应的解析规则，让程序只抓取这个特定标签范围内的文字，从而把导航栏、广告、评论区等无关信息过滤掉。

规模化采集的挑战与工具

每一篇新闻都手动去构造链接，这是不现实的，要想获得大量新闻数据，就得先批量获取新闻列表页的链接，这对于编程新手来讲是个门槛，此时能够借助一些可视化采集工具，像八爪鱼采集器，这类工具准许用户以鼠标点击的方式，去模拟浏览行为，自动进行翻页，抓取列表页里的所有新闻标题以及链接，还能导出成为结构化的表格，给后续的正文爬取提供链接库。

数据整理与后续处理

通过工具或程序爬取到的原始数据通常需要清洗和整理。例如，要把爬取到的全部新闻数据按频道分类存放，合并多个文件，并统一成“标题、正文、分类”这样的字段格式。在2023年的某些数据采集中，人们时常需要为机器学习训练准备数据，因此明确新闻的分类属性相当重要；处理时还要留意数据平衡，防止某一类新闻数量过多而影响后续分析。

针对那些有着想要去尝试网络数据采集想法的新手而言，你觉得最大的阻碍究竟是去学习编程语法这件事，还是去理解始终处于不断变化状态的网页结构自身呢？欢迎在评论区域分享你个人的看法，要是感觉这篇文章存在一定帮助，请点赞给予支持。

上一篇：国家新闻出版署印发出版业十四五时期发展规划及多项专项规划 | 下一篇：新闻出版业十二五规划如何科学谋划实施？看柳斌杰报告 | 返回列表

行业资讯

爬虫爬取新闻实战：小白如何用Python迅速学会爬取千条新闻？