import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
import hashlib
import socket
from string import *
from cgi import escape
from traceback import format_exc
from Queue import Queue, Empty as QueueEmpty
from BeautifulSoup import BeautifulSoup
# Value sent as the User-Agent header on every request ("deneme agent").
AGENT = " ".join(("deneme", "agent"))
def getir_butunLinkler(url):
    """Fetch ``url`` and print every link collected from it, numbered from 0.

    Args:
        url: address of the page to download.

    Each link is written to standard output as ``"<index>. <link>"``.
    Nothing is returned.
    """
    page = Getir(url)
    page.getirgetir()
    # A separate loop name keeps the ``url`` argument from being shadowed.
    for index, link in enumerate(page.out_links()):
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("%d. %s" % (index, link))
"""
getir_butunLinkler("http://www.eciftim.com")
0. http://www.eciftim.com/
1. http://www.eciftim.com/kategori/
2. http://www.eciftim.com/xkategory/sonilanlar/
3. http://www.eciftim.com/login/signup/
4. http://www.eciftim.com/login/
5. http://www.eciftim.com/xkategory/xkategory/es/gunluk-esya/
6. http://www.eciftim.com/xkategory/xkategory/es/ev-esya/
7. http://www.eciftim.com/xkategory/xkategory/es/gunluk-seyler/
8. http://www.eciftim.com/xkategory/xkategory/es/elktronik-esya/
9. http://www.eciftim.com/xkategory/xkategory/es/pazarlama/
10. http://www.eciftim.com/xkategory/xkategory/es/kelepir/
11. http://www.eciftim.com
12. http://www.eciftim.com/ilan/7/ajax/
13. http://www.eciftim.com/ilan/6/ajax/
14. http://www.eciftim.com/index.html
15. http://www.eciftim.com/hakkimizda.html
16. http://www.eciftim.com/yardim.html
17. http://www.eciftim.com/guvenlik.html
18. http://www.eciftim.com/nasilcalisir.html
19. http://www.eciftim.com/kullanimkosulu.html
20. http://www.eciftim.com/sirketlericin.html
21. http://www.eciftim.com/gizlilik.html
"""
class Link(object):
    """A directed link from one URL to another, tagged with a link type.

    Two links are equal, and hash equal, when their source, destination and
    link type all match, so instances can be stored in sets or used as
    dictionary keys.
    """

    def __init__(self, src, dst, link_type):
        """Store the link's endpoints and type.

        Args:
            src: where the link was found.
            dst: where the link points to.
            link_type: caller-defined label for the kind of link.
        """
        self.src = src
        self.dst = dst
        self.link_type = link_type

    def _key(self):
        """Return the tuple that defines this link's identity."""
        return (self.src, self.dst, self.link_type)

    def __hash__(self):
        return hash(self._key())

    def __eq__(self, other):
        # Let Python fall back to its default comparison for foreign types
        # instead of failing with AttributeError on a missing attribute.
        if not isinstance(other, Link):
            return NotImplemented
        return self._key() == other._key()

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; without this method two
        # equal links would also compare as "not equal".
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __repr__(self):
        return "Link(%r, %r, %r)" % self._key()

    def __str__(self):
        return self.src + " -> " + self.dst
class DataIstisna(Exception):
    """Error about a fetched resource, carrying its MIME type and URL.

    Attributes:
        mimetype: MIME type reported for the resource.
        url: address of the resource.
    """

    def __init__(self, message, mimetype, url):
        """Create the error.

        Args:
            message: human-readable description; becomes ``str(error)``.
            mimetype: MIME type reported for the resource.
            url: address of the resource.
        """
        super(DataIstisna, self).__init__(message)
        self.mimetype = mimetype
        self.url = url
class Getir(object):
    """Download one HTML page and collect the URLs of its ``<a href>`` links.

    Call :meth:`getirgetir` to perform the download; afterwards the object
    can be indexed or iterated to read the collected links.

    Attributes:
        url: address given to the constructor.
        out_urls: collected link URLs, in document order, without duplicates.
        encoding: ``""`` until a page has been read, then ``"utf-8"``.
        content: decoded page text; ``""`` until a page has been read.
    """

    def __init__(self, url):
        """Remember ``url``; nothing is downloaded yet.

        Args:
            url: address of the page to fetch.
        """
        self.url = url
        self.out_urls = []
        self.encoding = ""
        self.content = ""
        # NOTE: this changes the default timeout for every socket created
        # afterwards in the process; kept because existing callers may rely
        # on it.
        socket.setdefaulttimeout(2)

    def __getitem__(self, x):
        return self.out_urls[x]

    def __iter__(self):
        return iter(self.out_urls)

    def __contains__(self, url):
        return url in self.out_urls

    def out_links(self):
        """Return the list of collected link URLs (the live list, not a copy)."""
        return self.out_urls

    def _addHeaders(self, request):
        """Attach the User-Agent header to ``request``."""
        request.add_header("User-Agent", AGENT)

    def _open(self):
        """Build the request and opener for ``self.url``.

        Returns:
            A ``(request, opener)`` tuple, or ``None`` if building them
            raised ``IOError``.
        """
        try:
            request = urllib2.Request(self.url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)

    def getirgetir(self):
        """Fetch the page and append its links to ``out_urls``.

        HTTP errors, URL errors, socket errors (including timeouts) and
        non-HTML responses are reported on standard error and leave
        ``out_urls`` unchanged; they are not raised to the caller.
        """
        opened = self._open()
        if opened is None:
            # _open() failed; there is nothing to unpack or fetch.
            return
        request, handle = opened
        self._addHeaders(request)

        # Bound up front so every error path below leaves it defined.
        tags = []
        try:
            data = handle.open(request)
            try:
                mime_type = data.info().gettype()
                if mime_type != "text/html":
                    raise DataIstisna("bir sey yok %s" % mime_type,
                                      mime_type, data.geturl())
                # The body is always decoded as UTF-8; undecodable bytes are
                # replaced rather than aborting the fetch.
                self.content = data.read().decode("utf-8", "replace")
            finally:
                # Release the connection whether or not reading succeeded.
                data.close()
            self.encoding = "utf-8"
            tags = BeautifulSoup(self.content)("a")
        except urllib2.HTTPError as error:
            # Must precede URLError: HTTPError is a subclass of it.
            if error.code == 404:
                sys.stderr.write("ERROR: %s -> %s\n" % (error, error.url))
            else:
                sys.stderr.write("ERROR: %s\n" % error)
        except urllib2.URLError as error:
            sys.stderr.write("ERROR: %s\n" % error)
        except socket.error as error:
            # A timeout while reading the body is raised as socket.timeout,
            # which is not wrapped in URLError.
            sys.stderr.write("ERROR: %s\n" % error)
        except DataIstisna as error:
            sys.stderr.write("atlaniyor %s, e %s\n"
                             % (error.url, error.mimetype))

        self._collect_links(tags)

    def _collect_links(self, tags):
        """Append the absolute, encoded URL of each tag's ``href`` to ``out_urls``.

        Tags without an ``href`` are skipped, and a URL already present in
        ``out_urls`` is not added again.
        """
        if not tags:
            return
        base = self.url.encode(self.encoding)
        # Built once so each duplicate check is a set lookup on the same
        # encoded form that is stored in out_urls.
        seen = set(self.out_urls)
        for tag in tags:
            href = tag.get("href")
            if href is None:
                continue
            # NOTE(review): cgi.escape turns "&", "<" and ">" into HTML
            # entities, so "&" in a query string is stored as "&amp;".
            # Kept as in the original; confirm this is intended.
            link = urlparse.urljoin(base, escape(href)).encode(self.encoding)
            if link not in seen:
                seen.add(link)
                self.out_urls.append(link)
# Forum-page residue that followed the pasted code (not part of the program):
#   0
#   * 924
#   - 18-08-2012, 08:52:50