Simple Python crawler exercise: News crawling on sohu.com

I helped a friend with a course project: a crawler that grabs the title, publish time, and body content of news pages on sohu.com.
The code is very simple and uses nothing complicated, so it should be easy to follow.
The first step is to import the libraries we need. requests is a third-party library, so remember to install it first with pip (pip install requests).


    import requests
    import re
    import os
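
If you are not sure whether requests is actually installed, a quick import check will tell you right away (this little snippet is only a convenience, not part of the crawler itself):

    try:
        import requests  # third-party library, not part of the standard library
    except ImportError:
        raise SystemExit("requests is missing - install it with: pip install requests")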

 

First, get all the HTML you need from the Sohu home page.


# get the content of the Sohu page
def get_all_url(url):
    try:
        # fetch the full html content
        html = getHTMLText(url)
        return html
    except:
        print("failed connect")


# fetch the html content, the usual requests boilerplate
def getHTMLText(url):
    try:
        # the standard requests call: send the request and get the response
        r = requests.get(url)
        # raise an exception if the status code indicates failure
        r.raise_for_status()
        # switch to the encoding guessed from the page content
        r.encoding = r.apparent_encoding
        # return the html text
        return r.text
    except:
        return ''
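
To sanity-check the fetch step on its own, you can grab the home page and peek at what comes back. This is just a quick test using the same function and URL as the main program below, not part of the final script:

html = get_all_url("http://www.sohu.com/")
if html:
    # a non-empty result means the request and the decoding both worked
    print(len(html))
    print(html[:200])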

 

Next we need to pull all the hyperlinks out of the HTML, which is where the re library comes in.
If you don't know regular expressions yet, go learn them on your own; I won't cover them here.
Note that this step collects every URL it finds without checking whether it is actually a news URL; you could of course add that check yourself.


# analyze the content and collect the links I need
def parsePage(html):
    plt = []
    try:
        # re.findall(pattern, string) returns a list of all matches
        # match everything that looks like a sohu.com url
        plt = re.findall(r'http://www\.sohu\.com(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html)

    except:
        print("something went wrong here")
    print(plt)
    return plt
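
As a quick illustration of what the pattern picks up, here it is run against a small made-up snippet (the link below is invented, not a real article). The match stops as soon as it hits a character outside the allowed set, such as the closing quote:

sample = '<a href="http://www.sohu.com/a/123456_114988" target="_blank">headline</a>'
links = parsePage(sample)
# parsePage prints and returns: ['http://www.sohu.com/a/123456_114988']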



 

This part also uses the re library, this time to pull out the news title, publish time, and body content. Note that every site structures its HTML differently, so writing a crawler always means spending some time studying the page; press F12 in the browser to inspect the HTML.
Since I'm not sure whether pictures and the like count as body text, I simply grab the whole chunk between the article tags.


# use regexes to get the time and the title
def title_and_time(html):
    tat = []
    try:
        # re.findall(pattern, string) returns a list of all matches
        # get the time; returns an empty list if nothing matches
        time = re.findall('dateUpdate" content="(.*)" />', html)
        # get the title; returns an empty list if nothing matches
        title = re.findall("title>(.*)</title", html)
        # get the article body; returns an empty list if nothing matches
        article = re.findall(r'<article class="article" id="mp-editor">([\s\S]*)</article>', html)
        # concatenate the three lists and return the result
        # an empty list contributes nothing to the length,
        # so for a proper news page len(tat) should be 3
        tat = title + time + article
    except:
        print("something went wrong here")
    return tat
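
To see the shape of the result, here is the function run against a minimal made-up fragment that follows the markup the regexes expect (this is not real Sohu HTML, just the same tag layout):

page = ('<title>Sample headline</title>'
        '<meta itemprop="dateUpdate" content="2019-06-02 10:00" />'
        '<article class="article" id="mp-editor"><p>Body text of the article...</p></article>')
print(title_and_time(page))
# ['Sample headline', '2019-06-02 10:00', '<p>Body text of the article...</p>']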



 

And that's all we need. The full code is below for your reference.

# -*- coding: utf-8 -*-
# Life is short, I use Python
# Switching careers isn't easy, so please be encouraging


import requests
import re
import os


# all aboard, you twenty-somethings who just celebrated Children's Day (June 1st) - here we go



# get the content of the Sohu page
def get_all_url(url):
    try:
        # fetch the full html content
        html = getHTMLText(url)
        return html
    except:
        print("failed connect")


# fetch the html content, the usual requests boilerplate
def getHTMLText(url):
    try:
        # the standard requests call: send the request and get the response
        r = requests.get(url)
        # raise an exception if the status code indicates failure
        r.raise_for_status()
        # switch to the encoding guessed from the page content
        r.encoding = r.apparent_encoding
        # return the html text
        return r.text
    except:
        return ''

# analyze the content and collect the links I need
def parsePage(html):
    plt = []
    try:
        # re.findall(pattern, string) returns a list of all matches
        # match everything that looks like a sohu.com url
        plt = re.findall(r'http://www\.sohu\.com(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html)

    except:
        print("something went wrong here")
    print(plt)
    return plt



# use regexes to get the time and the title
def title_and_time(html):
    tat = []
    try:
        # re.findall(pattern, string) returns a list of all matches
        # get the time; returns an empty list if nothing matches
        time = re.findall('dateUpdate" content="(.*)" />', html)
        # get the title; returns an empty list if nothing matches
        title = re.findall("title>(.*)</title", html)
        # get the article body; returns an empty list if nothing matches
        article = re.findall(r'<article class="article" id="mp-editor">([\s\S]*)</article>', html)
        # concatenate the three lists and return the result
        # an empty list contributes nothing to the length,
        # so for a proper news page len(tat) should be 3
        tat = title + time + article
    except:
        print("something went wrong here")
    return tat




# the main function: this is where the actual work starts

def main():
    # fetch the Sohu home page and get its html
    html = get_all_url("http://www.sohu.com/")
    # use the regex above to pull out the news url links
    sp_url = parsePage(html)
    # a list for storing the news titles and times
    answer = []

    # check whether the save directory exists; if not, create it
    path = "news//"
    # os.path.exists returns True if the path exists, otherwise False
    isExists = os.path.exists(path)

    # act on the result
    if not isExists:
        # the directory does not exist yet
        # so create it
        os.makedirs(path)

        print(path + ' created')
    else:
        # the directory already exists, so do not create it, just say so
        print(path + ' already exists')

    # walk through every news url link
    for url in sp_url:
        # get the html of each news page
        html_sp = get_all_url(url)
        # get the title and time of each page
        title_time = title_and_time(html_sp)

        # a temporary variable
        tt = 0
        # a proper news page gives len(title_time) == 3, and only then does tt get a real value
        if len(title_time) == 3:
            tt = title_time[0] + "\n" + title_time[1] + "\n" + title_time[2] + "\n"

        # if it really was a news page, add it to answer
        if tt != 0:
            print(tt)  # print something while the program runs, otherwise it gets boring
            answer.append(tt)   # append this news item as one string to the end of answer


            # write it to a file
            try:
                # each news title gets its own txt file
                with open("news//" + title_time[0] + ".txt", "w+") as f:
                    # write this one item (title, time and body) into its file
                    f.write(tt)
                    # closing here is redundant inside a with block
                    f.close()
            except:
                pass


# run the main program
if __name__ == '__main__':
    main()


# print a line to say it is finished
print("Sohu news crawl finished")

# done
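After a run, the news// folder should hold one .txt file per article, each containing the title, the time, and the body. Here is a quick way to peek at what was saved, assuming the folder now contains at least one file:

import os

files = os.listdir("news//")
print(files[:5])    # first few file names produced by the crawl
if files:
    with open("news//" + files[0]) as f:
        print(f.read())    # title, time, then body of that article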

 

Because none of the content here is generated dynamically with JavaScript, the whole thing stays very simple. When I get the chance, I'll crawl a JS-rendered page and show you how that goes.
