##############################################################################
# Module : urlfinder.py
# Author :
# Date   :
# Updated and modified for ABC_OKC : Old King Cole
#
# Description : Torrent URL finder
#
##############################################################################
import re, urlparse
from HTMLParser import HTMLParser
from threading import Thread

from BitTornado.zurllib import urlopen

class UrlFinder(HTMLParser):
    """Collect the targets of every ``<a href=...>`` found in an HTML page.

    The start-tag and end-tag scanners of ``HTMLParser`` are replaced by
    tolerant versions that step over malformed markup instead of reporting
    an error, so that one sloppy tag does not hide the links that follow it.

    Results are stored in ``allLinks`` as absolute URLs.  Links whose href
    contains one of ``keywords`` are put at the front of the list, all other
    links are appended in document order.
    """

    # Patterns are compiled once for the class instead of on every tag.
    # Tag name right after the opening '<'.
    _TAGFIND = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
    # One attribute: name, optionally followed by '=' and a value.
    _ATTRFIND = re.compile(
        r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
    # Well-formed part of a start tag, up to (not including) '>' or '/>'.
    _LOCATESTARTTAGEND = re.compile(r"""
          <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
          (?:\s+                             # whitespace before attribute name
            (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
              (?:\s*=\s*                     # value indicator
                (?:'[^']*'                   # LITA-enclosed value
                  |\"[^\"]*\"                # LIT-enclosed value
                  |[^'\">\s]+                # bare value
                 )
               )?
             )
           )*
          \s*                                # trailing whitespace
        """, re.VERBOSE)
    _ENDENDTAG = re.compile('>')
    _ENDTAGFIND = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
    # Characters after which the tag may simply not be complete yet.
    _MAYBE_INCOMPLETE = ("abcdefghijklmnopqrstuvwxyz=/"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    # Text of the most recently scanned start tag (None before the first).
    _starttag_text = None

    def __init__(self, link):
        """Create a finder for the page located at *link*.

        *link* is the base URL against which relative hrefs are resolved.
        """
        HTMLParser.__init__(self)
        self.allLinks = []
        self.link = link
        self.keywords = ("/get", "download")

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding byte strings as UTF-8.

        Values that are already text are returned unchanged.
        Raises UnicodeError when a byte string is not valid UTF-8.
        """
        if isinstance(value, bytes):
            return value.decode('utf_8')
        return value

    def get_starttag_text(self):
        """Return the raw text of the most recently opened start tag."""
        # The base class keeps this in a name-mangled private attribute that
        # an overriding parse_starttag() cannot set by writing
        # ``self.__starttag_text``, so the value is kept and served here.
        return self._starttag_text

    def parse_starttag(self, i):
        """Scan the start tag beginning at ``rawdata[i]``.

        Dispatches to handle_starttag() / handle_startendtag() and returns
        the index just past the tag, or a negative value when the tag is not
        complete yet.  Junk inside the tag is ignored rather than reported.
        """
        self._starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self._starttag_text = rawdata[i:endpos]

        match = self._TAGFIND.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()

        # Parse the data between the tag name and endpos into attributes.
        attrs = []
        while k < endpos:
            m = self._ATTRFIND.match(rawdata, k)
            if not m:
                break
            k = m.end()
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            else:
                quoted = (attrvalue[:1] in ('"', "'")
                          and attrvalue[:1] == attrvalue[-1:])
                if quoted:
                    attrvalue = attrvalue[1:-1]
                # Decode before unescaping: unescape() returns unicode and
                # chokes on undecoded bytes following an '&'.
                try:
                    attrvalue = self._to_text(attrvalue)
                except UnicodeError:
                    # Undecodable value: drop this attribute only.
                    continue
                # Only quoted values get their entities resolved.  Entity
                # references always start with '&', so skip the pass when
                # there is nothing to resolve.
                if quoted and '&' in attrvalue:
                    attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))

        # Whatever is left before endpos is the tag terminator, possibly
        # preceded by junk characters which are deliberately ignored.
        if rawdata[k:endpos].strip().endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag beginning at ``rawdata[i]``.

        Returns -1 when the buffer ends before the tag does, so the caller
        waits for more data.  A malformed tag is never reported as an error:
        its end is estimated so that parsing can carry on.

        Raises AssertionError when ``rawdata[i]`` does not open a start tag.
        """
        rawdata = self.rawdata
        m = self._LOCATESTARTTAGEND.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        following = rawdata[j:j + 1]
        if following == ">":
            return j + 1
        if rawdata.startswith("/>", j):
            return j + 2
        if following == "":
            # End of input
            return -1
        if following in self._MAYBE_INCOMPLETE:
            # Either the buffer stops in or before an attribute value (or
            # right after the '/' of a '/>'), or the tag is malformed, e.g.
            # <a href="x"class="y">.  Only wait for more data when the tag
            # cannot be closed with what is already buffered; otherwise the
            # parser would stall on this tag forever.
            close = rawdata.find(">", j)
            if close < 0:
                return -1
            return close + 1
        # Malformed start tag: step over the offending character.  The
        # position bookkeeping is left to the caller, which calls
        # updatepos() itself once the tag has been handled.
        return j + 1

    def parse_endtag(self, i):
        """Scan the end tag beginning at ``rawdata[i]``.

        Calls handle_endtag() with the lower-cased tag name (an empty string
        for a malformed end tag) and returns the index just past the next
        '>', or -1 when no '>' has been received yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
        closing = self._ENDENDTAG.search(rawdata, i + 1) # >
        if not closing:
            return -1
        match = self._ENDTAGFIND.match(rawdata, i) # </ + tag + >
        # A bad end tag is not an error here: it is reported with no name.
        tag = match.group(1).lower() if match else ""
        self.handle_endtag(tag)
        self.clear_cdata_mode()
        return closing.end()

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag as an absolute URL.

        *attrs* is the list of (name, value) pairs of the tag.  Tags other
        than ``a`` and anchors without a non-empty href are ignored.
        """
        if tag != 'a':
            return
        href = dict(attrs).get('href')
        if not href:
            return
        if href.startswith(("http://", "https://")):
            url = href
        else:
            url = urlparse.urljoin(self.link, href)
        # Keywords are looked up in the href as written in the page, and
        # matching links jump the queue.
        if any(keyword in href for keyword in self.keywords):
            self.allLinks.insert(0, url)
        else:
            self.allLinks.append(url)


class ParserThread(Thread):
    """Worker thread that scans one web page for torrent links.

    The page is fed to *parser*; its ``allLinks`` are searched first for
    direct ``.torrent`` / ``magnet:`` links and, failing that, each link is
    requested to look for an ``application/x-bittorrent`` Content-Type.
    Every torrent link found is handed to the RSS panel's ``AddTorrent``.

    NOTE(review): the status bar is updated directly from this thread while
    list updates go through ``invokeLater`` - confirm that ``SetStatusText``
    is safe to call off the GUI thread.
    """

    def __init__(self, utility, parser, url, key, articles, rssrule = None, fastscan = True, cookies = None, pagecontent = None):
        """Prepare a scan of *url*.

        utility     -- application object giving access to ``lang`` and ``window``
        parser      -- object with ``feed()`` and an ``allLinks`` list
        url         -- address of the page to scan
        key         -- stored as ``self.key``; not used by this class
        articles    -- list entries whose status reflects the scan result
        rssrule     -- passed on to ``AddTorrent``; None means the scan was
                       not triggered by an RSS rule
        fastscan    -- stop at the first torrent link found
        cookies     -- value of the Cookie request header, or None
        pagecontent -- already downloaded page; when None the page is fetched
        """
        Thread.__init__(self)
        self.utility = utility
        self.localize = self.utility.lang.get
        self.window = utility.window
        self.rsspanel = self.window.rsspanel
        self.list = self.rsspanel.list
        self.invokeLater = self.rsspanel.invokeLater
        self.parser = parser
        self.desc5 = url
        self.key = key
        self.articles = articles
        self.rssrule = rssrule
        self.daemon = True
        self.torrentlinks = []
        self.fastscan = fastscan
        self.stopall = False
        self.cookies = cookies
        self.pagecontent = pagecontent

    def StopAll(self):
        """Ask the scan to stop at its next checkpoint."""
        self.stopall = True

    def run(self):
        """Thread entry point."""
        self.GetTheData()

    def addParserTorrentURLFromRSSCallback(self, url, status):
        """Handle the result of one ``AddTorrent`` request.

        *status* is "OK" on success; any other value is logged as the error.
        URLs are saved once the last pending request has reported back.
        """
        if status == "OK":
            # Add sub url to history and log files
            self.rsspanel.saveurls |= self.rsspanel.AddToHistory(url)
            # Add main url to history file
            self.rsspanel.saveurls |= self.rsspanel.AddToHistory(self.desc5, tolog = False)
            # Change color to "downloaded" = Green
            self._mark_articles(2)
            self.window.parent.SetStatusText(self.localize('successfuldownloadfrom') + " %s" % self.desc5)
        else:
            self.rsspanel.AddToLog(status, self.desc5, "error")
            # Change color to "Fail" = Red
            self._mark_articles(3)
            self.window.parent.SetStatusText(self.localize('couldntgettorrent') + " %s" % self.desc5)
        self.rsspanel.nbrssaddtorrent -= 1
        if self.rsspanel.nbrssaddtorrent == 0 and self.rsspanel.saveurls:
            self.rsspanel.SaveURLs()
            self.rsspanel.saveurls = False

    def GetTheData(self):
        """Run the scan and always unregister this thread afterwards."""
        try:
            self._scan()
        finally:
            # Also reached when the parser raises, so that a broken page
            # does not leave a stale entry behind.
            self._forget_thread()

    def OnStopScan(self):
        """Report that the scan was stopped on user request."""
        if not self.stopall:
            return
        # Change color to "Fail" = Red
        self._mark_articles(3)
        self.window.parent.SetStatusText(self.localize('scanningurlstopped') + " %s" % self.desc5)
        self.rsspanel.AddToLog(self.localize('scanningurlstopped'), self.desc5, "error")
        self._forget_thread()

    def _scan(self):
        """Fetch, parse and search the page, then queue what was found."""
        self.window.parent.SetStatusText(self.localize('searchingin') + " %s" % self.desc5)
        self.rsspanel.AddToLog(self.localize('log_tryingtodownloadfrom'), self.desc5, "info")
        if self._stopped():
            return

        headers = {}
        if self.cookies is not None:
            headers['Cookie'] = self.cookies

        self._feed_parser(headers)
        if self._stopped():
            return

        self._collect_direct_links()
        if self._stopped():
            return

        # If there aren't any direct links, check the Content-Type
        if not self.torrentlinks:
            self._probe_content_types(headers)
        if self._stopped():
            return

        if self.torrentlinks:
            self._queue_torrents(headers)
        else:
            self._report_no_torrents()

    def _stopped(self):
        """Report the stop and return True when the user asked to stop."""
        if self.stopall:
            self.OnStopScan()
            return True
        return False

    def _feed_parser(self, headers):
        """Give the page to the parser, downloading it first if needed."""
        if self.pagecontent is not None:
            self.parser.feed(self.pagecontent)
            return
        try:
            page = urlopen(self.desc5, headers)
            try:
                self.parser.feed(page.read())
            finally:
                self._close_quietly(page)
        except IOError:
            # Best effort: an unreadable page simply yields no links and is
            # reported further down as "no torrents".
            pass

    def _collect_direct_links(self):
        """Pick the links that point straight at a torrent or a magnet."""
        for link in self.parser.allLinks:
            if self.stopall:
                break
            lowered = link.lower()
            if lowered.endswith(".torrent") or lowered.startswith("magnet:"):
                self.torrentlinks.append(link)
                if self.fastscan:
                    break

    def _probe_content_types(self, headers):
        """Request every link and keep those served as a torrent."""
        links = self.parser.allLinks
        total = len(links)
        for position, link in enumerate(links, 1):
            if self.stopall:
                break
            try:
                handle = urlopen(link, headers)
                try:
                    # A response without Content-Type is not a failure,
                    # it just is not a torrent.
                    content_type = handle.response.getheader("Content-Type") or ""
                finally:
                    self._close_quietly(handle)
            except Exception:
                self.window.parent.SetStatusText(self.localize('cantgetinfofor') + " %s" % link)
                self.rsspanel.AddToLog(self.localize('log_cantgetinfofor'), link, "error")
            else:
                if "application/x-bittorrent" in content_type:
                    self.torrentlinks.append(link)
                    if self.fastscan:
                        break
            self.window.parent.SetStatusText("[" + str(position) + "/" + str(total) + "]" + self.localize('searchingin') + " %s" % self.desc5)

    def _queue_torrents(self, headers):
        """Hand every torrent link found to the RSS panel."""
        self.window.parent.SetStatusText(self.localize('downloadingtorrentsfrom') + " %s" % self.desc5)
        if self.rssrule is None:
            caller = ''
        else:
            caller = 'rss'
        for url in self.torrentlinks:
            # Counted down again in addParserTorrentURLFromRSSCallback()
            self.rsspanel.nbrssaddtorrent += 1
            self.rsspanel.AddTorrent(self.addParserTorrentURLFromRSSCallback, url, caller = caller, rssrule = self.rssrule,
                                     headers = headers, callbackinfo = url)

    def _report_no_torrents(self):
        """Log that the page held no torrent and flag the articles."""
        self.window.parent.SetStatusText(self.localize('log_notorrents') + " %s" % self.desc5)
        self.rsspanel.AddToLog(self.localize('log_notorrents'), self.desc5, "error")
        # Change color to "Fail" = Red
        self._mark_articles(3)

    def _mark_articles(self, status):
        """Schedule a status change for every article of this scan."""
        for article in self.articles:
            self.invokeLater(self.list.setStatus, [article, status])

    def _forget_thread(self):
        """Remove this scan from the panel's table of running scans."""
        try:
            del self.rsspanel.parserthreads[self.desc5]
        except KeyError:
            # Already removed, e.g. by OnStopScan()
            pass

    @staticmethod
    def _close_quietly(handle):
        """Close *handle* if it can be closed, ignoring any failure."""
        # NOTE(review): assumes the object returned by urlopen() offers
        # close(); nothing is done when it does not.
        closer = getattr(handle, 'close', None)
        if closer is None:
            return
        try:
            closer()
        except Exception:
            # Closing is best effort: the data has been read already.
            pass
