나는 Scrapy로 범용 스크레이퍼를 만들려고 시도했다. 아이디어는 URL을 입력으로 받아 해당 URL의 페이지만 긁어내는 것인데, 실제로는 YouTube 등 입력한 사이트 밖으로 벗어나는 것처럼 보인다. 이상적으로는 깊이 옵션(1, 2, 3 등)도 지정할 수 있으면 좋겠다. 이것을 달성하는 방법에 대한 아이디어가 있는가? — Scrapy 범용 스크레이퍼
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib
from route import urls
import pickle
import os
import urllib2
import urlparse
def tag_visible(element):
    """Return True when *element* is a text node a browser would render.

    Rejects HTML comments and any text whose parent tag is one of the
    non-rendered containers (style/script/head/title/meta/[document]).
    """
    non_rendered = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if isinstance(element, Comment):
        return False
    return element.parent.name not in non_rendered
def text_from_html(body):
    """Parse *body* as HTML and return its visible text, space-joined.

    Text nodes are filtered through tag_visible() so script/style/comment
    content is excluded; each surviving node is stripped before joining.
    """
    soup = BeautifulSoup(body, 'html.parser')
    stripped_visible = (
        node.strip()
        for node in soup.findAll(text=True)
        if tag_visible(node)
    )
    return u" ".join(stripped_visible)
def getAllUrl(url):
    """Fetch *url* and return a de-duplicated list of its <a href> targets.

    Every href is resolved against *url* with urlparse.urljoin, so relative
    links become absolute while already-absolute links (http AND https —
    the old 'http://' substring check silently mishandled https) pass
    through unchanged.  First-seen order is preserved.

    Returns an empty list when the page cannot be fetched.
    """
    try:
        page = urllib2.urlopen(url).read()
    except (urllib2.URLError, IOError) as e:
        # Narrowed from a bare except: report the failure instead of
        # silently returning an empty crawl frontier.
        print(e)
        return []
    urlList = []
    seen = set()  # O(1) membership test instead of scanning urlList per anchor
    soup = BeautifulSoup(page, 'html.parser')  # explicit parser: deterministic
    for anchor in soup.findAll('a', href=True):
        absolute = urlparse.urljoin(url, anchor['href'])
        if absolute not in seen:
            seen.add(absolute)
            urlList.append(absolute)
    return urlList
def listAllUrl(url):
    """Return *url* (a list of URL strings) with duplicates removed.

    Unlike the previous list(set(...)) version, first-seen order is
    preserved, so the crawl order is deterministic across runs.
    """
    seen = set()
    urls_new = []
    for u in url:
        if u not in seen:
            seen.add(u)
            urls_new.append(u)
    return urls_new
# ---- main script: collect the entered page's links, save each page's visible text ----
count = 0
main_url = str(raw_input('Enter the url : '))

# Derive output names from the host, e.g. "http://www.example.com" ->
# folder "example.com" (after the first '.'), combined file "example".
url_split = main_url.split('.', 1)
folder_name = url_split[1]
txtfile_split = folder_name.split('.', 1)
txtfile_name = txtfile_split[0]

url = getAllUrl(main_url)
urls_new = listAllUrl(url)

out_dir = 'c:/Scrapy/Extracted/' + folder_name + "/"
if not os.path.isdir(out_dir):
    # Guard: os.makedirs raises OSError if the directory already exists,
    # which crashed every re-run for the same site.
    os.makedirs(out_dir)

for url in urls_new:
    if not url.startswith("http"):
        # Resolve relative links against the entry URL; the old
        # main_url + url concatenation produced broken URLs for any
        # href not beginning with '/'.
        url = urlparse.urljoin(main_url, url)
    # Drop the fragment: '#section' addresses a location inside the same
    # document, so fetching it (or the old '#' -> '/' rewrite, which
    # fabricated nonexistent paths) is wrong.
    new_url = url.split('#', 1)[0]
    count = count + 1
    if new_url:
        print('%d>> %s' % (count, new_url))
        html = urllib.urlopen(new_url).read()
        page_text_data = text_from_html(html)
        # One combined log file per site (append mode)...
        with open(out_dir + txtfile_name + ".txt", "a") as myfile:
            myfile.writelines("\n\n" + new_url.encode('utf-8') +
                              "\n\n" + page_text_data.encode('utf-8'))
        # ...plus one numbered file per fetched page.  The 'with' block
        # closes the file; the old explicit close() inside it was redundant.
        path = out_dir
        filename = "url" + str(count) + ".txt"
        with open(os.path.join(path, filename), 'wb') as temp_file:
            temp_file.write(page_text_data.encode('utf-8'))