BeautifulSoup is a Python module for parsing documents by hand. It differs from scrapy: scrapy is a packaged framework where you mostly fill in the blanks of a fixed structure, whereas with BeautifulSoup you have to build the wheels yourself. That makes it a bit more work than scrapy, but also more flexible.
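For a feel of what "building it yourself" means, here is a minimal sketch of the BeautifulSoup workflow (the HTML snippet and the post class name are invented for illustration):

# A minimal sketch: feed BeautifulSoup some markup, then query it by hand.
from bs4 import BeautifulSoup

html = '<ul><li class="post">first</li><li class="post">second</li></ul>'
soup = BeautifulSoup(html, 'lxml')
for li in soup.find_all('li', class_='post'):
    print(li.get_text())  # -> first, second

Every selection and extraction step is spelled out explicitly, which is exactly the flexibility (and the extra work) the comparison above refers to.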
The following example crawls content from a Baidu Tieba board.
# -*- coding:utf-8 -*-
__author__ = 'fengzhankui'
import urllib2
from bs4 import BeautifulSoup


class Item(object):
    """Container for one thread entry, in the spirit of a scrapy Item."""
    title = None
    firstAuthor = None
    firstTime = None
    reNum = None
    content = None
    lastAuthor = None
    lastTime = None


class GetTiebaInfo(object):
    def __init__(self, url):
        self.url = url
        self.pageSum = 5  # number of result pages to crawl
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        # Tieba paginates with pn=0,50,100,...; rewrite the trailing
        # pn value of the base url once per page.
        urls = []
        pns = [str(i * 50) for i in range(pageSum)]
        ul = self.url.split('=')
        for pn in pns:
            ul[-1] = pn
            urls.append('='.join(ul))
        return urls

    def spider(self, urls):
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if not htmlContent:  # skip pages that failed to download
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            # class_ with a list matches ANY of the listed classes, so
            # pinned threads are caught too; they are filtered below.
            tagsli = soup.find_all('li', class_=['j_thread_list', 'clearfix'])[2:]
            for tag in tagsli:
                # Pinned threads carry no abstract div - skip them.
                # (The trailing space in the class string matches the
                # site's exact class attribute value.)
                if tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}) is None:
                    continue
                item = Item()
                item.title = tag.find('a', attrs={'class': 'j_th_tit'}).get_text().strip()
                item.firstAuthor = tag.find('span', attrs={'class': 'frs-author-name-wrap'}).a.get_text().strip()
                item.firstTime = tag.find('span', attrs={'title': u'创建时间'}).get_text().strip()
                item.reNum = tag.find('span', attrs={'title': u'回复'}).get_text().strip()
                item.content = tag.find('div', attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
                item.lastAuthor = tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}).a.get_text().strip()
                item.lastTime = tag.find('span', attrs={'title': u'最后回复时间'}).get_text().strip()
                items.append(item)
        return items

    def pipelines(self, items):
        # Append the items as tab-separated fields, one thread per line.
        with open('tieba.txt', 'a') as fp:
            for item in items:
                fp.write('title:' + item.title.encode('utf8') + '\t')
                fp.write('firstAuthor:' + item.firstAuthor.encode('utf8') + '\t')
                fp.write('firstTime:' + item.firstTime.encode('utf8') + '\t')
                fp.write('reNum:' + item.reNum.encode('utf8') + '\t')
                fp.write('content:' + item.content.encode('utf8') + '\t')
                fp.write('lastAuthor:' + item.lastAuthor.encode('utf8') + '\t')
                fp.write('lastTime:' + item.lastTime.encode('utf8') + '\t')
                fp.write('\n')

    def getResponseContent(self, url):
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except Exception:
            print 'failed to fetch ' + url.encode('utf8')
            return None
        return response.read()


if __name__ == '__main__':
    url = u'http://tieba.baidu.com/f?kw=战狼2&ie=utf-8&pn=50'
    GetTiebaInfo(url)
Code notes:
This example borrows scrapy's structure: define an Item class, fetch the HTML for each URL, then hand it to a third method for processing. Every Tieba board has pinned threads at the top, and because class-name matching defaults to a membership ("in") test rather than an exact ("and") match, the class names alone cannot single out the regular threads; that is why the tag loop carries an extra condition that filters the pinned entries out.
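To make that "in" behaviour concrete, here is a small sketch (the markup and the thread_top class are invented; real pinned threads differ mainly in extra classes and in lacking the abstract div):

from bs4 import BeautifulSoup

html = ('<ul>'
        '<li class="j_thread_list thread_top clearfix">pinned</li>'
        '<li class="j_thread_list clearfix">normal</li>'
        '</ul>')
soup = BeautifulSoup(html, 'lxml')
# A list passed to class_ is a membership test: a tag matches if ANY of
# its classes appears in the list, so the pinned entry matches as well.
tags = soup.find_all('li', class_=['j_thread_list', 'clearfix'])
print(len(tags))  # 2 - the class match does not exclude the pinned thread

This is why the spider cannot rely on class names alone and instead skips any tag that lacks the abstract div, which on the real page only regular threads carry.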