Today, I try to use ptyhon to do a function of grabbing web content and generating word document. The function is very simple. Make a record for future use.
The third-party component Python docx is used to generate word, so install the third-party component first. As Python installed under Windows does not have the module of setuptools by default, you need to install the module of setuptools first
1. It can be found on the official website of Python https://bootstrap.pypa.io/ez_ setup.py , save the code locally and execute: Python EZ_ setup.py
2. Download Python docx( https://pypi.python.org/pypi/python-docx/0.7.4 )After downloading, unzip and go to XXX / python-docx-0.7.4 to install Python docx: Python setup.py install
In this way, the installation of Python docx is successful. You can use it to operate word documents. Here is a reference for the generation of word documents https://python-docx.readthedocs.org/en/latest/index.html
HTML parsing uses sgmlparser in sgmllib, and URL content acquisition uses urllib and urllib2
to parse
The code is as follows:
# -*- coding: cp936 -*-
from sgmllib import SGMLParser
import os
import sys
import urllib
import urllib2
from docx import Document
from docx.shared import Inches
import time
##Get the url to be parsed
class GetUrl(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.start=False
self.urlArr=[]
def start_div(self,attr):
for name,value in attr:
if value=="ChairmanCont Bureau":#Fixed values in page js
self.start=True
def end_div(self):
self.start=False
def start_a(self,attr):
if self.start:
for name,value in attr:
self.urlArr.append(value)
def getUrlArr(self):
return self.urlArr
##Parse the url obtained above to get useful data
class getManInfo(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.start=False
self.p=False
self.dl=False
self.manInfo=[]
self.subInfo=[]
def start_div(self,attr):
for name,value in attr:
if value=="SpeakerInfo":#Fixed values in page js
self.start=True
def end_div(self):
self.start=False
def start_p(self,attr):
if self.dl:
self.p=True
def end_p(self):
self.p=False
def start_img(self,attr):
if self.dl:
for name,value in attr:
self.subInfo.append(value)
def handle_data(self,data):
if self.p:
self.subInfo.append(data.decode('utf-8'))
def start_dl(self,attr):
if self.start:
self.dl=True
def end_dl(self):
self.manInfo.append(self.subInfo)
self.subInfo=[]
self.dl=False
def getManInfo(self):
return self.manInfo
urlSource="http://www.XXX"
sourceData=urllib2.urlopen(urlSource).read()
startTime=time.clock()
##get urls
getUrl=GetUrl()
getUrl.feed(sourceData)
urlArr=getUrl.getUrlArr()
getUrl.close()
print "get url use:" + str((time.clock() - startTime))
startTime=time.clock()
##get maninfos
manInfos=getManInfo()
for url in urlArr:#one url one person
data=urllib2.urlopen(url).read()
manInfos.feed(data)
infos=manInfos.getManInfo()
manInfos.close()
print "get maninfos use:" + str((time.clock() - startTime))
startTime=time.clock()
#word
saveFile=os.getcwd()+"\\xxx.docx"
doc=Document()
##word title
doc.add_heading("HEAD".decode('gbk'),0)
p=doc.add_paragraph("HEADCONTENT:".decode('gbk'))
##write info
for infoArr in infos:
i=0
for info in infoArr:
if i==0:##img url
arr1=info.split('.')
suffix=arr1[len(arr1)-1]
arr2=info.split('/')
preffix=arr2[len(arr2)-2]
imgFile=os.getcwd()+"\\imgs\\"+preffix+"."+suffix
if not os.path.exists(os.getcwd()+"\\imgs"):
os.mkdir(os.getcwd()+"\\imgs")
imgData=urllib2.urlopen(info).read()
try:
f=open(imgFile,'wb')
f.write(imgData)
f.close()
doc.add_picture(imgFile,width=Inches(1.25))
os.remove(imgFile)
except Exception as err:
print (err)
elif i==1:
doc.add_heading(info+":",level=1)
else:
doc.add_paragraph(info,style='ListBullet')
i=i+1
doc.save(saveFile)
print "word use:" + str((time.clock() - startTime))