Today I tried using Python to grab web content and generate a Word document. The function is very simple; I am recording it here for future use.
The third-party package python-docx is used to generate the Word document, so install that package first. Because Python installed under Windows does not include the setuptools module by default, you need to install setuptools before anything else:
1. Get ez_setup.py from the official Python site (https://bootstrap.pypa.io/ez_setup.py), save the code locally, and execute: python ez_setup.py
2. Download python-docx (https://pypi.python.org/pypi/python-docx/0.7.4). After downloading, unzip it, go to XXX/python-docx-0.7.4, and install python-docx with: python setup.py install
With that, python-docx is installed and you can use it to work with Word documents. For reference on generating Word documents, see https://python-docx.readthedocs.org/en/latest/index.html
HTML is parsed with SGMLParser from sgmllib, and URL content is fetched with urllib and urllib2.
The code is as follows:
# -*- coding: cp936 -*-
from sgmllib import SGMLParser
import os
import sys
import urllib
import urllib2
from docx import Document
from docx.shared import Inches
import time
##Get the url to be parsed
class GetUrl(SGMLParser):
    """Collect the link targets found inside the "ChairmanCont Bureau" <div>.

    Feed the page source to the parser, then read the collected URLs with
    getUrlArr(). Each tag handler receives ``attr`` as a sequence of
    (name, value) pairs.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # True while the most recently opened marker <div> has not been closed.
        self.start = False
        # href values collected so far, in document order.
        self.urlArr = []

    def start_div(self, attr):
        """Start collecting when a <div> carries the marker attribute value."""
        for name, value in attr:
            if value == "ChairmanCont Bureau":  # Fixed values in page js
                self.start = True

    def end_div(self):
        """Stop collecting as soon as any </div> is seen."""
        self.start = False

    def start_a(self, attr):
        """Record the href of an <a> that appears inside the marker <div>."""
        if not self.start:
            return
        for name, value in attr:
            # Only href holds a URL; other attributes (class, target, ...)
            # must not end up in the list because every entry is later
            # opened as a URL.
            if name == "href" and value:
                self.urlArr.append(value)

    def getUrlArr(self):
        """Return the list of collected href values."""
        return self.urlArr
##Parse the url obtained above to get useful data
class getManInfo(SGMLParser):
    """Extract one record per <dl> found inside a "SpeakerInfo" <div>.

    Each record is a list holding the ``src`` of every <img> in the <dl>
    followed by the text of its <p> elements, in document order. Records
    accumulate across feed() calls and are read with getManInfo(). Each tag
    handler receives ``attr`` as a sequence of (name, value) pairs.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # True while the most recently opened marker <div> has not been closed.
        self.start = False
        # True while inside a <p> that belongs to a tracked <dl>.
        self.p = False
        # True while inside a <dl> that was opened within the marker <div>.
        self.dl = False
        # Finished records, one list per <dl>.
        self.manInfo = []
        # Record currently being assembled.
        self.subInfo = []

    def start_div(self, attr):
        """Start tracking when a <div> carries the marker attribute value."""
        for name, value in attr:
            if value == "SpeakerInfo":  # Fixed values in page js
                self.start = True

    def end_div(self):
        """Stop tracking as soon as any </div> is seen."""
        self.start = False

    def start_p(self, attr):
        """Begin capturing text for a <p> inside a tracked <dl>."""
        if self.dl:
            self.p = True

    def end_p(self):
        """Stop capturing text."""
        self.p = False

    def start_img(self, attr):
        """Record the image URL of an <img> inside a tracked <dl>."""
        if not self.dl:
            return
        for name, value in attr:
            # Only src is the image URL; storing alt/width/... as well would
            # shift the positions the record's consumer relies on.
            if name == "src" and value:
                self.subInfo.append(value)

    def handle_data(self, data):
        """Append text found inside a captured <p>, decoded from UTF-8."""
        if self.p:
            self.subInfo.append(data.decode('utf-8'))

    def start_dl(self, attr):
        """Open a record when a <dl> starts inside the marker <div>."""
        if self.start:
            self.dl = True

    def end_dl(self):
        """Close the current record, if one is open."""
        # A </dl> outside the marker <div> has no record to flush; skipping it
        # avoids appending spurious empty records.
        if self.dl:
            self.manInfo.append(self.subInfo)
        self.subInfo = []
        self.dl = False

    def getManInfo(self):
        """Return the list of finished records."""
        return self.manInfo
# Driver script: fetch the index page, collect the per-person URLs, parse each
# person's page, then write everything into a Word document in the current
# working directory. Each stage prints how long it took.
urlSource = "http://www.XXX"
sourceData = urllib2.urlopen(urlSource).read()
startTime = time.clock()

## get urls
getUrl = GetUrl()
getUrl.feed(sourceData)
urlArr = getUrl.getUrlArr()
getUrl.close()
print("get url use:" + str(time.clock() - startTime))
startTime = time.clock()

## get maninfos
manInfos = getManInfo()
for url in urlArr:  # one url one person
    data = urllib2.urlopen(url).read()
    manInfos.feed(data)
infos = manInfos.getManInfo()
manInfos.close()
print("get maninfos use:" + str(time.clock() - startTime))
startTime = time.clock()

# word
# os.path.join keeps the paths valid on any platform, not only where "\\" is
# the path separator.
saveFile = os.path.join(os.getcwd(), "xxx.docx")
imgDir = os.path.join(os.getcwd(), "imgs")
doc = Document()

## word title
doc.add_heading("HEAD".decode('gbk'), 0)
p = doc.add_paragraph("HEADCONTENT:".decode('gbk'))

## write info
for infoArr in infos:
    for i, info in enumerate(infoArr):
        if i == 0:  ## img url
            # Temp file name: second-to-last "/"-separated piece of the URL
            # plus the URL's extension.
            dotParts = info.split('.')
            suffix = dotParts[len(dotParts) - 1]
            slashParts = info.split('/')
            preffix = slashParts[len(slashParts) - 2]
            imgFile = os.path.join(imgDir, preffix + "." + suffix)
            if not os.path.exists(imgDir):
                os.mkdir(imgDir)
            # Best effort: a picture that cannot be downloaded or embedded is
            # reported and skipped so the rest of the document is still built.
            try:
                imgData = urllib2.urlopen(info).read()
                with open(imgFile, 'wb') as f:
                    f.write(imgData)
                try:
                    doc.add_picture(imgFile, width=Inches(1.25))
                finally:
                    # The image is only a temporary copy; remove it even when
                    # embedding fails.
                    os.remove(imgFile)
            except Exception as err:
                print(err)
        elif i == 1:
            doc.add_heading(info + ":", level=1)
        else:
            doc.add_paragraph(info, style='ListBullet')
doc.save(saveFile)
print("word use:" + str(time.clock() - startTime))
Read More:
- Python parses XML files (parses, updates, writes)
- Python: How to Encode the File (including HTML, TXT, Doc, etc.)
- CV: How to extract the part of a picture with a specified color
- Python: How to handle HTML escape characters
- Extracting Data from XML (Using Python to Access Web Data)
- How to Solve word2vec Module Error: AttributeError & UnicodeDecodeError
- Python: How to Reshape the data in Pandas DataFrame
- [CHM] Python: How to Extract CHM Data
- Python automatically generates the requirements file for the current project
- How to Solve Python WARNING: Ignoring invalid distribution -ip (e:\python\python_dowmload\lib\site-packages)
- Python: How to Create an Automatic Recording Program
- Python: How to Obtain a Public IP Quickly
- How to Solve Python ImportError: cannot import name UnrewindableBodyError
- Python: How to Disable InsecureRequestWarning error
- How to Fix “HTTP error 403: forbidden” in Python 3. X
- Python 3.X error: valueerror: data type must provide an itemsize
- Python: How to Auto Add Watermark to PDF
- Python: Panda scramble data
- How to Fix keyerror in Python dictionary lookup
- An introduction to sys modules in Python and how packages are imported and used