diff --git a/announce_url_title.py b/announce_url_title.py
index 1b802d3..acca4aa 100644
--- a/announce_url_title.py
+++ b/announce_url_title.py
@@ -83,14 +83,14 @@
 import weechat
 w = weechat
 import re
-import htmllib
+import html.parser
 from time import time as now
 from fnmatch import fnmatch
-from urllib import quote
+import urllib.request
 
 SCRIPT_NAME = "announce_url_title"
 SCRIPT_AUTHOR = "xt <xt@bash.no>"
-SCRIPT_VERSION = "18"
+SCRIPT_VERSION = "19"
 SCRIPT_LICENSE = "GPL3"
 SCRIPT_DESC = "Announce URL titles to channel or locally"
@@ -121,13 +121,71 @@
 buffer_name = ''
 urls = {}
 script_nick = 'url'
+
+
+def error_callback(*_, **__):
+    pass
+
+def is_string(data):
+    return isinstance(data, str)
+
+def is_bytes(data):
+    return isinstance(data, bytes)
+
+def to_ascii(data):
+    if is_string(data):
+        data = data.encode('ascii', errors='ignore')
+    elif is_bytes(data):
+        data = data.decode('ascii', errors='ignore')
+    else:
+        data = str(data).encode('ascii', errors='ignore')
+    return data
+
+
+class Parser(html.parser.HTMLParser):
+    def __init__(self, url):
+        self.title = None
+        self.rec = False
+        html.parser.HTMLParser.__init__(self)
+        try:
+            self.feed(to_ascii(urllib.request.urlopen(url).read()))
+        except urllib.error.HTTPError:
+            return
+        except urllib.error.URLError:
+            return
+        except ValueError:
+            return
+
+        self.rec = False
+        self.error = error_callback
+
+    def handle_starttag(self, tag, attrs):
+        if tag == 'title':
+            self.rec = True
+
+    def handle_data(self, data):
+        if self.rec:
+            self.title = data
+
+    def handle_endtag(self, tag):
+        if tag == 'title':
+            self.rec = False
+
+
+def get_title(url):
+    if Parser(url).title:
+        return Parser(url).title
+    else:
+        return "URL doesn’t have a title"
+
+
 def say(s, buffer=''):
     """normal msg"""
     weechat.prnt(buffer, '%s\t%s' %(script_nick, s))
 
 def unescape(s):
     """Unescape HTML entities"""
-    p = htmllib.HTMLParser(None)
+    p = html.parser.HTMLParser(None)
     p.save_bgn()
     p.feed(s)
     return p.save_end()
@@ -170,7 +228,7 @@ def url_print_cb(data, buffer, time, tags, displayed, highlight, prefix, message
     ignorelist = w.config_get_plugin('url_ignore').split(',')
 
     for url in urlRe.findall(message):
-        url = quote(url, "%/:=&?~#+!$,;@()*[]") # Escape URL
+        url = urllib.parse.quote(url, "%/:=&?~#+!$,;@()*[]") # Escape URL
         ignore = False
         for ignore_part in ignorelist:
             if ignore_part.strip():
@@ -199,59 +257,46 @@
         if not url_d: # empty dict means not launched
             url_d['launched'] = now()
 
-            # Read 8192
-            python2_bin = w.info_get("python2_bin", "") or "python"
-            cmd = python2_bin + " -c \"import urllib2; opener = urllib2.build_opener();"
-            cmd += "opener.addheaders = [('User-agent','%s')];" % user_agent
-            cmd += "print opener.open('%s').read(8192)\"" % url
+            title = get_title(url)
 
             url_d['stdout'] = ''
-            url_d['url_hook_process'] = w.hook_process(cmd, 30 * 1000, "url_process_cb", "")
+            url_d['url_hook_process'] = w.hook_process(title, 30 * 1000, "title_process_cb", "")
 
     return w.WEECHAT_RC_OK
 
-def url_process_cb(data, command, rc, stdout, stderr):
+
+def title_process_cb(data, title, rc, stdout, stderr):
     """ Callback parsing html for title """
 
     global buffer_name, urls
 
-    url = command.split("'")[-2]
-    if stdout != "":
-        urls[url]['stdout'] += stdout
-    if int(rc) >= 0:
+    max_len = int(w.config_get_plugin('title_max_length'))
+    if len(title) > max_len:
+        title = "%s [...]" % title[0:max_len]
 
-        head = re.sub("[\r\n\t ]"," ", urls[url]['stdout'])
-        title = re.search('(?i)\<title\>(.*?)\</title\>', head)
-        if title:
-            title = unescape(title.group(1))
-
-            max_len = int(w.config_get_plugin('title_max_length'))
-            if len(title) > max_len:
-                title = "%s [...]" % title[0:max_len]
-
-            splits = buffer_name.split('.') #FIXME bad code
-            server = splits[0]
-            buffer = '.'.join(splits[1:])
-            output = w.config_get_plugin('prefix') + title + w.config_get_plugin('suffix')
-            announce_public = w.config_get_plugin('announce_public')
-            if announce_public == 'on':
-                found = False
-                for active_buffer in w.config_get_plugin('buffers').split(','):
-                    if active_buffer.lower() == buffer_name.lower():
-                        w.command('', '/msg -server %s %s %s' %(server, buffer, output))
-                        found = True
-                for active_buffer in w.config_get_plugin('buffers_notice').split(','):
-                    if active_buffer.lower() == buffer_name.lower():
-                        w.command('', '/notice -server %s %s %s' %(server, buffer, output))
-                        found = True
-                if found == False:
-                    say(output,w.buffer_search('', buffer_name))
-            else:
-                say(output,w.buffer_search('', buffer_name))
-        urls[url]['stdout'] = ''
+    splits = buffer_name.split('.') #FIXME bad code
+    server = splits[0]
+    buffer = '.'.join(splits[1:])
+    output = w.config_get_plugin('prefix') + title + w.config_get_plugin('suffix')
+    announce_public = w.config_get_plugin('announce_public')
+    if announce_public == 'on':
+        found = False
+        for active_buffer in w.config_get_plugin('buffers').split(','):
+            if active_buffer.lower() == buffer_name.lower():
+                w.command('', '/msg -server %s %s %s' %(server, buffer, output))
+                found = True
+        for active_buffer in w.config_get_plugin('buffers_notice').split(','):
+            if active_buffer.lower() == buffer_name.lower():
+                w.command('', '/notice -server %s %s %s' %(server, buffer, output))
+                found = True
+        if found == False:
+            say(output,w.buffer_search('', buffer_name))
+    else:
+        say(output,w.buffer_search('', buffer_name))
 
     return w.WEECHAT_RC_OK
 
+
 def purge_cb(*args):
     ''' Purge the url list on configured intervals '''
@@ -299,7 +344,7 @@ if __name__ == "__main__":
                        SCRIPT_DESC, "", ""):
         # Set default settings
-        for option, default_value in settings.iteritems():
+        for option, default_value in settings.items():
             if not w.config_is_set_plugin(option):
                 w.config_set_plugin(option, default_value)
         ignore_buffers = Ignores('ignore_buffers')