Crawling a certain document-archive site


Preface

Someone in 冰总's group chat shared a document-archive site, so I took the chance to practice a bit of Python.

Code

# -*- coding: utf-8 -*-
# @Author: Decade
# @Date:   2019-02-20 23:12:00
# @Last Modified by:   Marte
# @Last Modified time: 2019-02-21 01:20:40

import requests
import os
from urllib.parse import unquote
from bs4 import BeautifulSoup

def get_url(url):
    # Fetch a directory-listing page and return the URLs of its sub-directories.
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    dirs = []
    for a in soup.find_all(name='a'):
        href = a.attrs['href']
        # Directory links end with "/"; skip the parent-directory link "../".
        if href.split('/')[-1] == "" and not href.endswith("../"):
            dirs.append(url + href)
    return dirs
def get_pdf(url):
    # Collect the links to PDF files on a single directory page.
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    pdfs = []
    for a in soup.find_all(name='a'):
        new_url = url + a.attrs['href']
        if new_url.endswith(".pdf"):
            pdfs.append(new_url)
    return pdfs
def down_file(pdf_urls):
    # Download each PDF into a local folder named after its parent directory.
    for pdf_url in pdf_urls:
        folder = pdf_url.split('/')[-2]
        if not os.path.isdir(folder):
            os.mkdir(folder)
        r = requests.get(pdf_url, stream=True)
        # File names on the server are URL-encoded, so decode them for the local copy.
        localname = folder + '/' + unquote(pdf_url.split('/')[-1], 'utf-8')
        with open(localname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print(localname + " Download success")

root_url = "https://paper.bobylive.com/"
# Walk the directory tree three levels deep, then grab every PDF link found.
url_list1 = get_url(root_url)
url_list2 = []
url_list3 = []
for sub_url in url_list1[1:]:
    url_list2 += get_url(sub_url)
for sub_url in url_list2[1:]:
    url_list3 += get_url(sub_url)
url_list = url_list1 + url_list2 + url_list3
pdf_list = []
for url in url_list:
    pdf_list += get_pdf(url)
down_file(pdf_list)
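
Running the script from an empty working directory creates one folder per parent directory on the server and drops the decoded PDFs into it. As a rough sketch (not part of the original script, and assuming the site is a plain auto-index of folders and .pdf files), the same crawl can also be written with urllib.parse.urljoin, which copes with absolute hrefs and any nesting depth:

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_pdfs(url, found):
    # Walk an auto-index page: keep .pdf links, recurse into sub-directories.
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    for a in soup.find_all('a'):
        target = urljoin(url, a.get('href', ''))
        if target.endswith('.pdf'):
            found.append(target)
        elif target.endswith('/') and target.startswith(url) and target != url:
            collect_pdfs(target, found)
    return found

# e.g. down_file(collect_pdfs("https://paper.bobylive.com/", []))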

wget

wget -r -p -np -k -E https://paper.bobylive.com
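
For reference: -r crawls recursively, -np keeps wget from climbing above the starting directory, -p also fetches the resources each page needs, -k rewrites links so the mirror can be browsed locally, and -E appends an .html extension to saved pages.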
tag(s): crawler