Python: How to parse HTML, extract data, and generate Word documents

Today I tried using Python to grab web content and generate a Word document from it. The function is very simple; I'm recording it here for future reference.

The third-party package python-docx is used to generate the Word document, so it has to be installed first. Since Python installed under Windows does not include setuptools by default, you need to install setuptools before anything else:

1. Get ez_setup.py from https://bootstrap.pypa.io/ez_setup.py, save the code locally, and run: python ez_setup.py

2. Download python-docx (https://pypi.python.org/pypi/python-docx/0.7.4), unzip it, change into XXX/python-docx-0.7.4, and install it with: python setup.py install
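A quick way to confirm the install worked is to import the package from the command line; python-docx exposes its version string as docx.__version__, so this should print the installed version:

python -c "import docx; print docx.__version__"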

With that, python-docx is installed and you can use it to work with Word documents. The documentation on generating Word files is here: https://python-docx.readthedocs.org/en/latest/index.html
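As a quick taste of the API before the full script below, here is a minimal sketch (demo.docx and photo.jpg are made-up names for the example; the add_picture call assumes photo.jpg exists next to the script):

from docx import Document
from docx.shared import Inches

doc=Document()
doc.add_heading("Demo title",0)                    # level 0 = document title
doc.add_paragraph("A bullet item",style='ListBullet')
doc.add_picture("photo.jpg",width=Inches(1.25))    # assumes photo.jpg exists
doc.save("demo.docx")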


HTML parsing is done with SGMLParser from sgmllib, and the page content is fetched with urllib and urllib2.
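For anyone who has not used SGMLParser before: you subclass it, define start_<tag>/end_<tag> handlers that fire while feed() walks the markup, and collect whatever you need along the way. A toy sketch, separate from the script below (the URL is a placeholder):

from sgmllib import SGMLParser
import urllib2

class LinkCollector(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.links=[]

    # called once for every <a ...> tag; attr is a list of (name,value) pairs
    def start_a(self,attr):
        for name,value in attr:
            if name=="href":
                self.links.append(value)

parser=LinkCollector()
parser.feed(urllib2.urlopen("http://www.example.com").read())
parser.close()
print parser.links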

The code is as follows:

# -*- coding: cp936 -*-
from sgmllib import SGMLParser
import os
import sys
import urllib
import urllib2
from docx import Document
from docx.shared import Inches
import time

## Collect the URLs that need to be parsed
class GetUrl(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.start=False
        self.urlArr=[]


    def start_div(self,attr):
        for name,value in attr:
            if value=="ChairmanCont Bureau":  # fixed value taken from the page's js
                self.start=True


    def end_div(self):
        self.start=False


    def start_a(self,attr):
        if self.start:
            for name,value in attr:
                if name=="href":  # keep only the link target
                    self.urlArr.append(value)
            


    def getUrlArr(self):
        return self.urlArr
    
## Parse each URL obtained above and extract the useful data
class GetManInfo(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.start=False
        self.p=False
        self.dl=False
        self.manInfo=[]
        self.subInfo=[]

    def start_div(self,attr):
        for name,value in attr:
            if value=="SpeakerInfo":  # fixed value taken from the page's js
                self.start=True

    def end_div(self):
        self.start=False

    def start_p(self,attr):
        if self.dl:
            self.p=True

    def end_p(self):
        self.p=False

    def start_img(self,attr):
        if self.dl:
            for name,value in attr:
                self.subInfo.append(value)
        


    def handle_data(self,data):
        if self.p:
            self.subInfo.append(data.decode('utf-8'))


    def start_dl(self,attr):
        if self.start:
            self.dl=True

    def end_dl(self):
        if self.dl:  # only keep data collected inside the SpeakerInfo div
            self.manInfo.append(self.subInfo)
            self.subInfo=[]
        self.dl=False

    def getManInfo(self):
        return self.manInfo



                

urlSource="http://www.XXX"  # index page listing one link per person
sourceData=urllib2.urlopen(urlSource).read()

startTime=time.clock()
## step 1: collect the URLs (one per person)
getUrl=GetUrl()
getUrl.feed(sourceData)
urlArr=getUrl.getUrlArr()
getUrl.close()
print "get url use:" + str((time.clock() - startTime))
startTime=time.clock()


## step 2: parse each person's page for its data
manInfos=GetManInfo()
for url in urlArr:#one url one person
    data=urllib2.urlopen(url).read()
    manInfos.feed(data)
infos=manInfos.getManInfo()
manInfos.close()
print "get maninfos use:" + str((time.clock() - startTime))
startTime=time.clock()

## step 3: generate the Word document
saveFile=os.getcwd()+"\\xxx.docx"
doc=Document()
## document title (HEAD and HEADCONTENT below are placeholder strings)
doc.add_heading("HEAD".decode('gbk'),0)
p=doc.add_paragraph("HEADCONTENT:".decode('gbk'))


## write each person's info into the document
for infoArr in infos:
    i=0
    for info in infoArr:
        if i==0:  ## the first field is the image URL
            # build a local file name from the URL: extension from the last
            # '.' segment, base name from the second-to-last '/' segment
            suffix=info.split('.')[-1]
            prefix=info.split('/')[-2]
            imgFile=os.getcwd()+"\\imgs\\"+prefix+"."+suffix
            if not os.path.exists(os.getcwd()+"\\imgs"):
                os.mkdir(os.getcwd()+"\\imgs")
            imgData=urllib2.urlopen(info).read()

            try:
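                # save the image locally, embed it into the document,
                # then delete the temporary file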
                f=open(imgFile,'wb')
                f.write(imgData)
                f.close()
                doc.add_picture(imgFile,width=Inches(1.25))
                os.remove(imgFile)
            except Exception as err:
                print (err)
  
            
        elif i==1:  ## the second field becomes a level-1 heading
            doc.add_heading(info+":",level=1)
        else:  ## remaining fields become bullet items
            doc.add_paragraph(info,style='ListBullet')
        i=i+1

    
doc.save(saveFile)
print "word use:" + str((time.clock() - startTime))

