1、爬虫入门程序

urllib2 实现下载网页的三种方式（下文仅给出第二、第三种方法的代码；示例假定已导入 urllib2、cookielib，并已定义变量 url）

# Method 2: wrap the URL in a Request object so a header can be attached.
print("第二种方法")
request = urllib2.Request(url)
# Present a Mozilla browser User-Agent instead of the library default.
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib2.urlopen(request)
print(response2.getcode())
body2 = response2.read()
print(len(body2))

# Method 3: route requests through an opener that stores cookies.
print("第三种方法")
cookie = cookielib.CookieJar()
# Give urllib2 the ability to handle cookies, collected in `cookie`.
cookie_handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookie_handler)
# Install the opener so the plain urllib2.urlopen() call below uses it.
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print(response3.getcode())
body3 = response3.read()
print(len(body3))
print(cookie)

使用 Beautiful Soup 解析 html 文件


#!/usr/bin/python
# -*- coding: UTF-8 -*-
 
import re
 
from bs4 import BeautifulSoup
# Sample HTML document queried by every example below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# The closing triple quote above was missing in the original, which left the
# string literal unterminated and swallowed the rest of the script.

# Create the BeautifulSoup parse object with the "html.parser" backend.
soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf-8")

# Collect every <a> tag in the document.
links = soup.find_all('a')
print("所有的链接")
for link in links:
    # One formatted string prints the same text as the comma-separated form.
    print("%s %s %s" % (link.name, link['href'], link.get_text()))

# Look up a single link by its exact href value.
print("获取特定的URL地址")
link_node = soup.find('a', href="http://example.com/elsie")
print("%s %s %s %s" % (
    link_node.name, link_node['href'], link_node['class'], link_node.get_text()))

# Look up a link whose href matches a regular expression.
print("正则表达式匹配")
link_node = soup.find('a', href=re.compile(r"ti"))
print("%s %s %s %s" % (
    link_node.name, link_node['href'], link_node['class'], link_node.get_text()))

# Fetch a <p> tag by CSS class and print its text content.
print("获取P段落的文字")
p_node = soup.find('p', class_='story')
print("%s %s %s" % (p_node.name, p_node['class'], p_node.get_text()))

2.爬虫程序添加 header，然后post请求

设置Header代码

import urllib  
import urllib2  
 
# Target of the login request.
url = 'http://www.server.com/login'

# Custom request header: identify as a browser through User-Agent.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Form fields, URL-encoded into the request body.
values = {'username': 'cqc', 'password': 'XXXX'}
data = urllib.urlencode(values)

# Build the request with both body and headers, send it, and read the reply.
request = urllib2.Request(url, data=data, headers=headers)
response = urllib2.urlopen(request)
page = response.read()

post方式

import urllib
import urllib2
 
# Login endpoint that receives the form.
url = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"

# Credentials to submit, URL-encoded into the request body.
values = {"username": "[email protected]", "password": "XXXX"}
data = urllib.urlencode(values)

# Passing `data` to Request makes urllib2 send the form as the request body.
request = urllib2.Request(url, data)
response = urllib2.urlopen(request)
body = response.read()
print(body)

3、爬虫程序添加cookie

利用 cookie 模拟网站登录

import urllib
import urllib2
import cookielib
 
# File that the cookies are written to.
filename = 'cookie.txt'

# A MozillaCookieJar instance holds the cookies and can later save them to
# `filename`.
cookie = cookielib.MozillaCookieJar(filename)
cookie_handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookie_handler)

# Login form fields, URL-encoded for the POST body.
credentials = {
    'stuid': '201200131012',
    'pwd': '23342321',
}
postdata = urllib.urlencode(credentials)

# URL used to log in to the academic affairs system.
loginUrl = 'http://jwxt.sdu.edu.cn:7890/pls/wwwbks/bks_login2.login'
# URL of the grade query page, requested after logging in.
gradeUrl = 'http://jwxt.sdu.edu.cn:7890/pls/wwwbks/bkscjcx.curscopre'

# Simulate the login; cookies set by the server end up in `cookie`.
result = opener.open(loginUrl, postdata)
# Save the cookies to cookie.txt, including discardable and expired ones.
cookie.save(ignore_discard=True, ignore_expires=True)

# Reuse the same opener, and therefore its cookies, for the grade page.
result = opener.open(gradeUrl)
print(result.read())

4.正则表达式

. 表示任何单个字符
[ ] 字符集，对单个字符给出取值范围
[^ ] 非字符集，对单个字符给出排除范围
* 前一个字符0次或无限次扩展
+ 前一个字符1次或无限次扩展
? 前一个字符0次或1次扩展
| 左右表达式任意一个
{m} 扩展前一个字符m次
{m,n} 扩展前一个字符m至n次（含n）
^ 匹配字符串开头
$ 匹配字符串结尾
( ) 分组标记，内部常与 | 操作符配合使用，表示多选一
\d 数字
\w 单词字符

爬虫基础

1、爬虫入门程序

urllib2 实现下载网页的三种方式

使用 Beautiful Soup 解析 html 文件

2.爬虫程序添加 header，然后post请求

post方式

3、爬虫程序添加cookie

利用 cookie 模拟网站登录

4.正则表达式

相关推荐

爬虫基础

1、爬虫入门程序

urllib2 实现下载网页的三种方式

使用 Beautiful Soup 解析 html 文件

2.爬虫程序添加header，然后post请求

post方式

3、爬虫程序添加cookie

利用 cookie 模拟网站登录

4.正则表达式

相关推荐

2.爬虫程序添加 header，然后post请求