Adapt to py3

Alarig Le Lay 2020-07-13 01:05:17 +02:00
parent 342248b05a
commit 39d5f5762c
Signed by: alarig
GPG Key ID: 7AFE62C6DF8BCDEC
1 changed file with 91 additions and 46 deletions

@@ -83,14 +83,14 @@
 import weechat
 w = weechat
 import re
-import htmllib
+import html.parser
 from time import time as now
 from fnmatch import fnmatch
-from urllib import quote
+import urllib.request
 
 SCRIPT_NAME = "announce_url_title"
 SCRIPT_AUTHOR = "xt <xt@bash.no>"
-SCRIPT_VERSION = "18"
+SCRIPT_VERSION = "19"
 SCRIPT_LICENSE = "GPL3"
 SCRIPT_DESC = "Announce URL titles to channel or locally"
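For readers porting similar scripts, this is where the removed Python 2 modules ended up in the Python 3 standard library (a reference sketch, not part of the commit):

    # Python 3 homes of the helpers this script used under Python 2.
    import html                # html.unescape() handles entities (htmllib is gone)
    import html.parser         # HTMLParser base class lives here
    import urllib.request      # urlopen() replaces urllib2
    import urllib.parse        # quote() moved here from urllib
    import urllib.error        # HTTPError and URLError moved here

    urllib.parse.quote('a b')  # -> 'a%20b'
    html.unescape('&amp;')     # -> '&'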
@@ -121,13 +121,71 @@ buffer_name = ''
 urls = {}
 script_nick = 'url'
 
+def error_callback(*_, **__):
+    pass
+
+def is_string(data):
+    return isinstance(data, str)
+
+def is_bytes(data):
+    return isinstance(data, bytes)
+
+def to_ascii(data):
+    if is_string(data):
+        data = data.encode('ascii', errors='ignore')
+    elif is_bytes(data):
+        data = data.decode('ascii', errors='ignore')
+    else:
+        data = str(data).encode('ascii', errors='ignore')
+    return data
+
+class Parser(html.parser.HTMLParser):
+    def __init__(self, url):
+        self.title = None
+        self.rec = False
+        html.parser.HTMLParser.__init__(self)
+        try:
+            self.feed(to_ascii(urllib.request.urlopen(url).read()))
+        except urllib.error.HTTPError:
+            return
+        except urllib.error.URLError:
+            return
+        except ValueError:
+            return
+        self.rec = False
+        self.error = error_callback
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'title':
+            self.rec = True
+
+    def handle_data(self, data):
+        if self.rec:
+            self.title = data
+
+    def handle_endtag(self, tag):
+        if tag == 'title':
+            self.rec = False
+
+def get_title(url):
+    if Parser(url).title:
+        return Parser(url).title
+    else:
+        return "URL doesnt have a title"
+
 def say(s, buffer=''):
     """normal msg"""
     weechat.prnt(buffer, '%s\t%s' %(script_nick, s))
 
 def unescape(s):
     """Unescape HTML entities"""
-    p = htmllib.HTMLParser(None)
+    p = html.parser.HTMLParser(None)
     p.save_bgn()
     p.feed(s)
     return p.save_end()
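Two things in this hunk are worth sketching. First, get_title() constructs Parser(url) twice, so every link is downloaded twice. Second, Python 3's html.parser.HTMLParser accepts no positional argument and has no save_bgn()/save_end() pair, so the ported unescape() would still fail at runtime; html.unescape() (Python >= 3.4) is the native replacement. A minimal fetch-once variant, assuming the Parser class from this hunk (get_title_once is a hypothetical name, not in the commit):

    import html

    def get_title_once(url):
        parser = Parser(url)  # single network round-trip
        if parser.title:
            # html.unescape() replaces the old htmllib
            # save_bgn()/save_end() idiom for entity decoding.
            return html.unescape(parser.title)
        return "URL doesnt have a title"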
@@ -170,7 +228,7 @@ def url_print_cb(data, buffer, time, tags, displayed, highlight, prefix, message
     ignorelist = w.config_get_plugin('url_ignore').split(',')
 
     for url in urlRe.findall(message):
-        url = quote(url, "%/:=&?~#+!$,;@()*[]") # Escape URL
+        url = urllib.parse.quote(url, "%/:=&?~#+!$,;@()*[]") # Escape URL
         ignore = False
         for ignore_part in ignorelist:
            if ignore_part.strip():
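A quick illustration of the rename in this hunk: quote() moved from urllib into urllib.parse, and its second argument is the set of characters to leave unescaped (a standalone sketch):

    from urllib.parse import quote

    quote('https://example.org/a b?x=1', "%/:=&?~#+!$,;@()*[]")
    # -> 'https://example.org/a%20b?x=1'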
@@ -199,59 +257,46 @@ def url_process_launcher():
         if not url_d: # empty dict means not launched
             url_d['launched'] = now()
 
-            # Read 8192
-            python2_bin = w.info_get("python2_bin", "") or "python"
-            cmd = python2_bin + " -c \"import urllib2; opener = urllib2.build_opener();"
-            cmd += "opener.addheaders = [('User-agent','%s')];" % user_agent
-            cmd += "print opener.open('%s').read(8192)\"" % url
+            title = get_title(url)
 
             url_d['stdout'] = ''
-            url_d['url_hook_process'] = w.hook_process(cmd, 30 * 1000, "url_process_cb", "")
+            url_d['url_hook_process'] = w.hook_process(title, 30 * 1000, "title_process_cb", "")
 
     return w.WEECHAT_RC_OK
 
-def url_process_cb(data, command, rc, stdout, stderr):
+def title_process_cb(data, title, rc, stdout, stderr):
     """ Callback parsing html for title """
 
     global buffer_name, urls
 
-    url = command.split("'")[-2]
-    if stdout != "":
-        urls[url]['stdout'] += stdout
-    if int(rc) >= 0:
+    max_len = int(w.config_get_plugin('title_max_length'))
+    if len(title) > max_len:
+        title = "%s [...]" % title[0:max_len]
 
-        head = re.sub("[\r\n\t ]"," ", urls[url]['stdout'])
-        title = re.search('(?i)\<title\>(.*?)\</title\>', head)
-        if title:
-            title = unescape(title.group(1))
-            max_len = int(w.config_get_plugin('title_max_length'))
-            if len(title) > max_len:
-                title = "%s [...]" % title[0:max_len]
-
-            splits = buffer_name.split('.') #FIXME bad code
-            server = splits[0]
-            buffer = '.'.join(splits[1:])
-            output = w.config_get_plugin('prefix') + title + w.config_get_plugin('suffix')
-            announce_public = w.config_get_plugin('announce_public')
-            if announce_public == 'on':
-                found = False
-                for active_buffer in w.config_get_plugin('buffers').split(','):
-                    if active_buffer.lower() == buffer_name.lower():
-                        w.command('', '/msg -server %s %s %s' %(server, buffer, output))
-                        found = True
-                for active_buffer in w.config_get_plugin('buffers_notice').split(','):
-                    if active_buffer.lower() == buffer_name.lower():
-                        w.command('', '/notice -server %s %s %s' %(server, buffer, output))
-                        found = True
-                if found == False:
-                    say(output,w.buffer_search('', buffer_name))
-            else:
-                say(output,w.buffer_search('', buffer_name))
-        urls[url]['stdout'] = ''
+    splits = buffer_name.split('.') #FIXME bad code
+    server = splits[0]
+    buffer = '.'.join(splits[1:])
+    output = w.config_get_plugin('prefix') + title + w.config_get_plugin('suffix')
+    announce_public = w.config_get_plugin('announce_public')
+    if announce_public == 'on':
+        found = False
+        for active_buffer in w.config_get_plugin('buffers').split(','):
+            if active_buffer.lower() == buffer_name.lower():
+                w.command('', '/msg -server %s %s %s' %(server, buffer, output))
+                found = True
+        for active_buffer in w.config_get_plugin('buffers_notice').split(','):
+            if active_buffer.lower() == buffer_name.lower():
+                w.command('', '/notice -server %s %s %s' %(server, buffer, output))
+                found = True
+        if found == False:
+            say(output,w.buffer_search('', buffer_name))
+    else:
+        say(output,w.buffer_search('', buffer_name))
 
     return w.WEECHAT_RC_OK
 
 def purge_cb(*args):
     ''' Purge the url list on configured intervals '''
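One caveat with this hunk: get_title() now downloads the page synchronously inside url_process_launcher(), and the resulting title string, rather than a command, is handed to w.hook_process(), which expects something it can run. If non-blocking fetching is still wanted, WeeChat's built-in URL transfer is one option (a sketch under that assumption, not the commit's approach; it reuses the launcher's url and url_d variables, and the callback would then extract <title> from stdout much as the pre-port code did):

    # hook_process treats a "url:..." item as a built-in URL transfer
    # (WeeChat >= 0.3.7) instead of a shell command; the page body
    # arrives in the callback's stdout argument.
    url_d['url_hook_process'] = w.hook_process(
        "url:" + url,        # fetch without blocking the client
        30 * 1000,           # timeout in milliseconds
        "title_process_cb", "")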
@@ -299,7 +344,7 @@ if __name__ == "__main__":
                       SCRIPT_DESC, "", ""):
 
         # Set default settings
-        for option, default_value in settings.iteritems():
+        for option, default_value in settings.items():
             if not w.config_is_set_plugin(option):
                 w.config_set_plugin(option, default_value)
         ignore_buffers = Ignores('ignore_buffers')
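The last hunk is the standard Python 3 dict-iteration fix: iteritems() is gone, and items() returns a view that iterates the same way (a trivial standalone sketch):

    settings = {'title_max_length': '100', 'announce_public': 'on'}
    for option, default_value in settings.items():
        print(option, default_value)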