这是期末考的补救机会作业
老师要我完成的是剩下的部分,也就是底下说明文字之后的部分。
import urllib.request
from bs4 import BeautifulSoup
def getText(url, encoding='utf-8'):
    """Download a web page and return its visible text, one piece per line.

    Args:
        url: Address of the page to fetch.
        encoding: Character-encoding hint handed to BeautifulSoup as
            ``from_encoding`` (default ``'utf-8'``).

    Returns:
        The page text with ``<script>`` and ``<style>`` contents removed.
        Every line is stripped, split on single spaces, empty pieces are
        dropped, and the remaining pieces are joined with ``'\\n'``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # Read the body inside ``with`` so the HTTP connection is always closed,
    # even if parsing below raises.
    with urllib.request.urlopen(urllib.request.Request(url)) as response:
        html = response.read()

    # Name the parser explicitly; otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # from machine to machine.
    soup = BeautifulSoup(html, 'html.parser', from_encoding=encoding)

    # Script and style elements carry no readable text: remove them.
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()
    # Strip leading/trailing whitespace from every line.
    lines = (line.strip() for line in text.splitlines())
    # Break each line into space-separated pieces.
    # NOTE(review): the common form of this recipe splits on two spaces to
    # separate headlines; confirm a single space is intended here.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # Drop empty pieces and put one piece per output line.
    return '\n'.join(chunk for chunk in chunks if chunk)
import re
def getVOA(url):
    """Return the article portion of a VOA page as one concatenated string.

    The page text comes from ``getText(url)`` and is scanned line by line:

    * a line starting with '打印' either arms the scan (first time) or, once
      collection is active, switches collection off again;
    * after the scan is armed, a line starting with '美国之音' switches
      collection on;
    * marker lines themselves are never collected.

    Collected lines are concatenated without any separator.

    Args:
        url: Address of the page to fetch and filter.

    Returns:
        The collected lines joined together, or ``''`` if the markers were
        never found.
    """
    page_lines = getText(url).split('\n')
    collected = []
    armed = False       # a '打印' line has been seen
    collecting = False  # a '美国之音' line has been seen after arming
    # NOTE(review): this flag starts out True and is only ever set to True,
    # so the '学个词-…' headline check below never changes the outcome.
    # Kept as-is to preserve behavior; confirm whether it was meant to
    # start as False.
    started = True

    for entry in page_lines:
        if entry.startswith('打印'):
            if collecting:
                collecting = False
            else:
                armed = True
            continue

        if armed and entry.startswith('美国之音'):
            collecting = True
            continue

        if not collecting:
            continue

        if re.match(r'学个词-\d+-\w+', entry):
            started = True
        if started:
            collected.append(entry)

    return ''.join(collected)
# Pages to scrape: each entry is a full page URL whose final path component
# is "<number>.html".
# NOTE(review): two entries point at /media/video/ pages rather than
# /content/ pages; confirm they are meant to be processed like the others.
urls=['http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTktc3RpY2tlci1zaG9jaw~~/1943689.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTgtZ3JhY2UtcGVyaW9k/1943688.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTctY2l2aWwtd2Fy/1943687.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTYtZGlzcGFyYWdl/1943685.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTUtcHJvaGliaXQ~/1939100.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTQtc3dpdGNo/1939098.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTMtdm9pY2U~/1939094.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTItbWFzY290/1939093.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTEtZXhjaGFuZ2U~/1939092.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNTAtYnJlYWR3aW5uZXI~/1935520.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDktYW5vbnltb3Vz/1935516.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDgtZHJhZnQ~/1935513.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDctaWRlbnRpZnk~/1935511.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDYtbmF0aW9ud2lkZQ~~/1935509.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTMzNy1jaGFyaXR5LQ~~/1933985.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDUtY29udHJpYnV0aW9u/1928911.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDQtY29udGFnaW91cw~~/1928909.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDMtYXNzZXNz/1928907.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDItZ3JhZmZpdGk~/1928906.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDEtZnVuZGluZw~~/1928904.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwNDAtYWNjb21wbGlzaG1lbnQ~/1925331.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzktcHVibGljLXRyYW5zaXQ~/1925330.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzgtZGF0YWJhc2U~/1925329.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzctaGVhcmluZw~~/1925327.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzYtcmFudA~~/1925325.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/media/video/1936377.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzUtcHJvZm91bmQ~/1919322.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzQtcGxhbi1i/1919321.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzMtdG94aWM~/1919314.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzItb24tdGhlLWJyaW5rLW9m/1919312.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzEtY29tcGVs/1919311.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMzAtbWF4LW91dA~~/1914530.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjktc2NyZWVuLXRpbWU~/1914527.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjgtdW5leHBlY3RlZA~~/1914522.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjctZGl2ZXJzZQ~~/1914519.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjYtd2lkZS1yYW5naW5n/1914515.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjUtYXQtbm8tY2hhcmdl/1914512.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base60-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjQtcmVoYWItY2VudGVy/1914508.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjMtY29tcGxhaW50/1914506.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjItc3VzcGljaW9u/1914504.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjEtb3V0bGF3/1914503.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/media/video/1936263.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMjAtbGV0aGFs/1904640.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base52-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTktcG9pc2Vk/1904636.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTgtbWFyaXRpbWU~/1904632.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base64-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTctc3VzdGFpbmFiaWxpdHk~/1904630.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base48-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTYtYmFy/1904626.html',
'http://www.voafanti.com/gate/big5/www.voachinese.com/content/-ifbase4-base56-JWU1JWFkJWE2JWU0JWI4JWFhJWU4JWFmJThkLTIwMTUtc2tlbGV0b24~/1899500.html']
# Assignment: write a loop that saves the text content of every link in
# urls to its own text file.
# Name each text file after the link's file name, with the extension changed
# from html to txt.
# For example, the file name of urls[0] is 1943689, so its text file must be
# 1943689.txt.
# The demo program below can display the text content of 1943689, but you
# must write a loop that reads and writes the content in batch.
# When finished, email me the program file and the extracted text files
# (zip compression is fine).
想请教怎么把文字档的档名存成每个网址最后面的数字(例如 1943689.txt)?
循环的部分也不太懂到底该怎么写……
各位可以救救我吗?