pip install beautifulsoup4 selenium pandas
# install the Pillow library (used for image processing)
pip install Pillow
# install the requests library (used to send HTTP requests)
pip install requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Start Chrome through the WebDriver binary at the given path.
# NOTE(review): recent Selenium 4 releases no longer accept `executable_path`;
# there the path is passed through a `Service` object instead - confirm against
# the installed Selenium version.
driver = webdriver.Chrome(executable_path='/nix/path/to/webdriver/executable')
driver.get('https://your.url/here?yes=brilliant')
results = []  # filled in by the parsing step
content = driver.page_source  # HTML of the page as loaded by the browser
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(content, 'html.parser')
# Example on how to define a function and select custom arguments for the
# code that goes into it.
def function_name(arguments):
    """Placeholder showing the shape of a function definition."""
    # Function body goes here.
我们将把URL爬虫移到一个自定义的函数中。此外,我们将重复使用“Python Web Scraping教程:循序渐进”一文中使用的相同代码,并将其改造为用于爬取完整的URL。
for a in soup.find_all(attrs={'class': 'class'}):
    name = a.find('a')
    # `results` stores link text, so compare the text (not the tag object);
    # also skip matched elements that contain no <a> at all.
    if name is not None and name.text not in results:
        results.append(name.text)
# Picking a name that represents the function will be useful later on.
def parse_image_urls(classes, location, source):
    """Append image URLs found in the parsed page to the module-level ``results``.

    Args:
        classes: Value of the ``class`` attribute of the elements to search in.
        location: Tag name looked up inside each matching element (e.g. "img").
        source: Attribute of that tag that holds the image URL (e.g. "src").
    """
    for a in soup.find_all(attrs={'class': classes}):
        name = a.find(location)
        if name is None:
            continue  # matched element does not contain the requested tag
        url = name.get(source)
        # Compare the URL itself - that is what `results` stores - so that
        # duplicates are actually skipped; ignore tags lacking the attribute.
        if url and url not in results:
            results.append(url)
注意,我们现在以不同的方式来append指定内容。我们不再直接append文本,而是在append中嵌套调用函数“get()”,并向其传入一个新参数“source”。我们使用“source”来指向网站中存储图片链接的字段,这些链接大概率存放在“src”、“data-src”或其他类似的HTML属性中。
parse_image_urls("blog-card__link", "img", "src")
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# NOTE(review): recent Selenium 4 releases no longer accept `executable_path`;
# there the path is passed through a `Service` object instead - confirm against
# the installed Selenium version.
driver = webdriver.Chrome(executable_path='/nix/path/to/webdriver/executable')
driver.get('https://your.url/here?yes=brilliant')
results = []  # image URLs collected by parse_image_urls()
content = driver.page_source
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(content, 'html.parser')


def parse_image_urls(classes, location, source):
    """Append image URLs found in ``soup`` to the module-level ``results``.

    Args:
        classes: Value of the ``class`` attribute of the elements to search in.
        location: Tag name looked up inside each matching element (e.g. "img").
        source: Attribute of that tag that holds the image URL (e.g. "src").
    """
    for a in soup.find_all(attrs={'class': classes}):
        name = a.find(location)
        if name is None:
            continue  # matched element does not contain the requested tag
        url = name.get(source)
        # Compare the URL itself - that is what `results` stores - so that
        # duplicates are actually skipped; ignore tags lacking the attribute.
        if url and url not in results:
            results.append(url)


parse_image_urls("blog-card__link", "img", "src")
由于我们有时需要导出抓取的数据,而且之前已经使用过pandas,因此可以把所有内容输出到一个“.csv”文件中进行检查。如果需要,我们可以通过这种方式排查任何可能的语义错误。
# One column named "links", one row per collected URL.
df = pd.DataFrame({"links": results})
df.to_csv('links.csv', index=False, encoding='utf-8')
# hashlib allows us to get hashes. We will be using sha1 to name our images.
import hashlib
# io manages file-related in/out operations.
import io
# pathlib lets us point to specific locations. Will be used to save our images.
import pathlib

# Import the requests library to send HTTP requests.
import requests
# We use Pillow to convert the downloaded bytes to an RGB image.
from PIL import Image

output_dir = pathlib.Path('nix/path/to/test')
# Saving fails if the target folder is missing, so create it up front.
output_dir.mkdir(parents=True, exist_ok=True)

for b in results:
    # Fetch the URL; the timeout keeps one unresponsive server from hanging
    # the whole run, and an HTTP error status is raised here instead of
    # handing an error page to Pillow.
    response = requests.get(b, timeout=30)
    response.raise_for_status()
    # Add the content of the url to a variable.
    image_content = response.content
    # Create a bytes buffer out of image_content and point image_file to it.
    image_file = io.BytesIO(image_content)
    image = Image.open(image_file).convert('RGB')
    # The file name is the first 10 hex characters of the sha1 hash of
    # 'image_content' (.hexdigest converts the hash into a string).
    file_path = output_dir / (hashlib.sha1(image_content).hexdigest()[:10] + '.png')
    # `quality` is not a PNG save option, so it is not passed.
    image.save(file_path, "PNG")
import hashlib
import io
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver

# NOTE(review): recent Selenium 4 releases no longer accept `executable_path`;
# there the path is passed through a `Service` object instead - confirm against
# the installed Selenium version.
driver = webdriver.Chrome(executable_path='/nix/path/to/webdriver/executable')
try:
    driver.get('https://your.url/here?yes=brilliant')
    # Scroll to the bottom of the page before reading its source.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    content = driver.page_source
finally:
    # The browser is not needed once the source is captured; quit even if
    # loading the page failed so no Chrome process is left behind.
    driver.quit()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(content, 'html.parser')


def gets_url(classes, location, source):
    """Return the image URLs found in the module-level ``soup``.

    Args:
        classes: Value of the ``class`` attribute of the elements to search in.
        location: Tag name looked up inside each matching element (e.g. "img").
        source: Attribute of that tag that holds the image URL (e.g. "src").

    Returns:
        A list of unique attribute values, in document order.
    """
    results = []
    for a in soup.find_all(attrs={'class': classes}):
        name = a.find(location)
        if name is None:
            continue  # matched element does not contain the requested tag
        url = name.get(source)
        # Compare the URL itself - that is what `results` stores - so that
        # duplicates are actually skipped; ignore tags lacking the attribute.
        if url and url not in results:
            results.append(url)
    return results


if __name__ == "__main__":
    returned_results = gets_url("blog-card__link", "img", "src")
    output_dir = Path('nix/path/to/test')
    # Saving fails if the target folder is missing, so create it up front.
    output_dir.mkdir(parents=True, exist_ok=True)
    for b in returned_results:
        # Raise on HTTP errors instead of handing an error page to Pillow.
        response = requests.get(b, timeout=30)
        response.raise_for_status()
        image_content = response.content
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        # File name: first 10 hex characters of the sha1 of the image bytes.
        file_path = output_dir / (hashlib.sha1(image_content).hexdigest()[:10] + '.png')
        # `quality` is not a PNG save option, so it is not passed.
        image.save(file_path, "PNG")
- 通过定义“file_path”变量,将图片输出到我们选择的文件夹中。
- Python返回“403 Forbidden”HTTP错误。
每当我们使用requests库向目标服务器发送请求时,请求都会带上一个默认的user-agent“python-requests/version.number”。某些网络服务可能会专门屏蔽这类user-agent,因为它们明显来自自动化程序。幸运的是,requests库允许我们指定任意的user-agent(或整个header):
image_content = requests.get(b, headers={'User-agent': 'Mozilla/5.0'}).content
在大多数情况下,添加user-agent就足够了。也有更复杂的情况:服务器可能会检查HTTP header中的其他字段,以确认请求来自真实用户。
import io
import pathlib
import hashlib
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver


def get_content_from_url(url):
    """Open ``url`` in Chrome, scroll to the bottom, and return the page source.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as reported by the driver after scrolling.
    """
    driver = webdriver.Chrome()  # add "executable_path=" if driver not in running directory
    try:
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        return driver.page_source
    finally:
        # We do not need the browser instance for further steps; quitting in
        # `finally` also closes it when loading the page raises.
        driver.quit()


def parse_image_urls(content, classes, location, source):
    """Extract image URLs from an HTML document.

    Args:
        content: HTML source to parse.
        classes: Value of the ``class`` attribute of the elements to search in.
        location: Tag name looked up inside each matching element (e.g. "img").
        source: Attribute of that tag that holds the image URL (e.g. "src").

    Returns:
        A list of unique attribute values, in document order.
    """
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content, "html.parser")
    results = []
    for a in soup.find_all(attrs={"class": classes}):
        name = a.find(location)
        if name is None:
            continue  # matched element does not contain the requested tag
        url = name.get(source)
        # Compare the URL itself - that is what `results` stores - so that
        # duplicates are actually skipped; ignore tags lacking the attribute.
        if url and url not in results:
            results.append(url)
    return results


def save_urls_to_csv(image_urls):
    """Write ``image_urls`` to ``links.csv`` as a single "links" column."""
    df = pd.DataFrame({"links": image_urls})
    df.to_csv("links.csv", index=False, encoding="utf-8")


def get_and_save_image_to_file(image_url, output_dir):
    """Download one image and store it as a PNG inside ``output_dir``.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes, followed by ".png".

    Args:
        image_url: Address of the image to download.
        output_dir: ``pathlib.Path`` of the target folder; created if missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        image_url, headers={"User-agent": "Mozilla/5.0"}, timeout=30,
    )
    # Fail here on e.g. a 403 instead of handing an error page to Pillow.
    response.raise_for_status()
    image_content = response.content
    image_file = io.BytesIO(image_content)
    image = Image.open(image_file).convert("RGB")
    filename = hashlib.sha1(image_content).hexdigest()[:10] + ".png"
    # Saving fails if the target folder is missing, so make sure it exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    file_path = output_dir / filename
    # `quality` is not a PNG save option, so it is not passed.
    image.save(file_path, "PNG")


def main():
    """Scrape the image URLs of one page, export them to CSV, and download them."""
    url = "https://your.url/here?yes=brilliant"
    content = get_content_from_url(url)
    image_urls = parse_image_urls(
        content=content, classes="blog-card__link", location="img", source="src",
    )
    save_urls_to_csv(image_urls)

    for image_url in image_urls:
        get_and_save_image_to_file(
            image_url, output_dir=pathlib.Path("nix/path/to/test"),
        )


if __name__ == "__main__":  # only executes when run as the main script
    main()