复制代码 代码如下:
#!/usr/bin/python
#-*-coding:utf-8-*-
# JCrawler
# Author: Jam <810441377@qq.com>
import time
import urllib2
from bs4 import BeautifulSoup
# 目标站点
TargetHost = "http://adirectory.blog.com"
# User Agent
UserAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36'
# 链接采集规则
# 目录链接采集规则
CategoryFind = [{'findMode':'find','findTag':'div','rule':{'id':'cat-nav'}},
{'findMode':'findAll','findTag':'a','rule':{}}]
# 文章链接采集规则
ArticleListFind = [{'findMode':'find','findTag':'div','rule':{'id':'content'}},
{'findMode':'findAll','findTag':'h2','rule':{'class':'title'}},
{'findMode':'findAll','findTag':'a','rule':{}}]
# 分页URL规则
PageUrl = 'page/#page/'
PageStart = 1
PageStep = 1
PageStopHtml = '404: Page Not Found'
def GetHtmlText(url):
request = urllib2.Request(url)
request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp")
request.add_header('Accept-Encoding', "*")
request.add_header('User-Agent', UserAgent)
return urllib2.urlopen(request).read()
def ArrToStr(varArr):
returnStr = ""
for s in varArr:
returnStr += str(s)
return returnStr
def GetHtmlFind(htmltext, findRule):
findReturn = BeautifulSoup(htmltext)
returnText = ""
for f in findRule:
if returnText != "":
findReturn = BeautifulSoup(returnText)
if f['findMode'] == 'find':
findReturn = findReturn.find(f['findTag'], f['rule'])
if f['findMode'] == 'findAll':
findReturn = findReturn.findAll(f['findTag'], f['rule'])
returnText = ArrToStr(findReturn)
return findReturn
def GetCategory():
categorys = [];
htmltext = GetHtmlText(TargetHost)
findReturn = GetHtmlFind(htmltext, CategoryFind)
for tag in findReturn:
print "[G]->Category:" + tag.string + "|Url:" + tag['href']
categorys.append({'name': tag.string, 'url': tag['href']})
return categorys;
def GetArticleList(categoryUrl):
articles = []
page = PageStart
#pageUrl = PageUrl
while True:
htmltext = ""
pageUrl = PageUrl.replace("#page", str(page))
print "[G]->PageUrl:" + categoryUrl + pageUrl
while True:
try:
htmltext = GetHtmlText(categoryUrl + pageUrl)
break
except urllib2.HTTPError,e:
print "[E]->HTTP Error:" + str(e.code)
if e.code == 404:
htmltext = PageStopHtml
break
if e.code == 504:
print "[E]->HTTP Error 504: Gateway Time-out, Wait"
time.sleep(5)
else:
break
if htmltext.find(PageStopHtml) >= 0:
print "End Page."
break
else:
findReturn = GetHtmlFind(htmltext, ArticleListFind)
for tag in findReturn:
if tag.string != None and tag['href'].find(TargetHost) >= 0:
print "[G]->Article:" + tag.string + "|Url:" + tag['href']
articles.append({'name': tag.string, 'url': tag['href']})
page += 1
return articles;
print "[G]->GetCategory"
Mycategorys = GetCategory();
print "[G]->GetCategory->Success."
time.sleep(3)
for category in Mycategorys:
print "[G]->GetArticleList:" + category['name']
GetArticleList(category['url'])
免责声明:本站资源来自互联网收集,仅供用于学习和交流,请遵循相关法律法规,本站一切资源不代表本站立场,如有侵权、后门、不妥请联系本站删除!
稳了!魔兽国服回归的3条重磅消息!官宣时间再确认!
昨天有一位朋友在大神群里分享,自己亚服账号被封号之后居然弹出了国服的封号信息对话框。
这里面让他访问的是一个国服的战网网址,com.cn和后面的zh都非常明白地表明这就是国服战网。
而他在复制这个网址并且进行登录之后,确实是网易的网址,也就是我们熟悉的停服之后国服发布的暴雪游戏产品运营到期开放退款的说明。这是一件比较奇怪的事情,因为以前都没有出现这样的情况,现在突然提示跳转到国服战网的网址,是不是说明了简体中文客户端已经开始进行更新了呢?
更新日志
- 群星.2022-福茂巨星·时空之轮日本唱片志系列DISC2范晓萱-RAIN【福茂】【WAV+CUE】
- 王闻-《男人四十4》[正版CD低速原抓WAV+CUE]
- 青燕子-八只眼演唱组《爱心》[WAV+CUE]
- 祁露想着你的好》WAV+CUE
- 陈致逸《赴梦之约 游戏主题原声音乐》[FLAC/分轨][159.96MB]
- 贵族音乐《睡眠自然流水声 ASMR白噪音背景音》[320K/MP3][155.72MB]
- 贵族音乐《睡眠自然流水声 ASMR白噪音背景音》[FLAC/分轨][857.58MB]
- 朱昕嵘《琴意绵绵6N纯银SQCD》[WAV+CUE]
- 降央卓玛《草原情6N纯银SQCD》WAV+CUE
- 傲日格乐《黑马琴HQCD》[WAV+CUE]
- 群星.2022-福茂巨星·时空之轮日本唱片志系列DISC3范晓萱-自言自语【福茂】【WAV+CUE】
- 群星.2022-福茂巨星·时空之轮日本唱片志系列DISC4那英-白天不懂夜的黑【福茂】【WAV+CUE】
- 群星.2015-华丽上班族电影原声大碟【大右音乐】【WAV+CUE】
- 陈粒《乌有乡地图》[320K/MP3][21.81MB]
- 陈粒《乌有乡地图》[FLAC/分轨][398.39MB]