Make link URL matching smarter

Ignore-this: bff39c43073f3576e31bda52b7383e6b
- Link URL matching no longer requires the URL to be at the
  beginning of the line
- Includes test case and documentation fix.

darcs-hash:20101227093449-82ea9-727739b99ac86256216350b90dc3e710ea181f51.gz
This commit is contained in:
Richard Darst 2010-12-27 01:34:49 -08:00
parent 11dcb55c4b
commit a7c62223ea
5 changed files with 75 additions and 27 deletions

View File

@ -169,21 +169,19 @@ adjust and have MeetBot re-process the logs later.
release. release.
#link #link
Add a link to the minutes. The URL must be the first thing on the
line, separated by a space from the rest of the line, and it will be Add a link to the minutes. The URL will be properly detected within
properly hyperlinked. This command is automatically detected if the line the line in most cases - the URL can't contain spaces. This command
starts with http:, https:, mailto:, and some other common protocols is automatically detected if the line starts with http:, https:,
defined in the ``UrlProtocols`` configuration variable. Examples:: mailto:, and some other common protocols defined in the
``UrlProtocols`` configuration variable. Examples::
< MrBeige> #link http://wiki.debian.org/MeetBot/ is the main page < MrBeige> #link http://wiki.debian.org/MeetBot/ is the main page
< MrBeige> http://wiki.debian.org/MeetBot/ is the main page < MrBeige> http://wiki.debian.org/MeetBot/ is the main page
Both of these two examples are equivalent, and will hyperlink
properly. The first example below won't hyperlink properly, the
second one won't be automatically detected::
< MrBeige> #link the main page is http://wiki.debian.org/MeetBot/ < MrBeige> #link the main page is http://wiki.debian.org/MeetBot/
< MrBeige> the main page is http://wiki.debian.org/MeetBot/ so go there
< MrBeige> the main page is http://wiki.debian.org/MeetBot/ so go
there. (This will NOT be detected automatically)

View File

@ -30,6 +30,7 @@
### ###
import os import os
import re
import time import time
import writers import writers
@ -73,10 +74,9 @@ class _BaseItem(object):
replacements[name] = getattr(self, name) replacements[name] = getattr(self, name)
replacements['nick'] = escapewith(replacements['nick']) replacements['nick'] = escapewith(replacements['nick'])
replacements['link'] = self.logURL(M) replacements['link'] = self.logURL(M)
if 'line' in replacements: for key in ('line', 'prefix', 'suffix', 'topic'):
replacements['line'] = escapewith(replacements['line']) if key in replacements:
if 'topic' in replacements: replacements[key] = escapewith(replacements[key])
replacements['topic'] = escapewith(replacements['topic'])
if 'url' in replacements: if 'url' in replacements:
replacements['url_quoteescaped'] = \ replacements['url_quoteescaped'] = \
escapewith(self.url.replace('"', "%22")) escapewith(self.url.replace('"', "%22"))
@ -228,24 +228,40 @@ class Rejected(GenericItem):
class Link(_BaseItem): class Link(_BaseItem):
itemtype = 'LINK' itemtype = 'LINK'
html_template = """<tr><td><a href='%(link)s#%(anchor)s'>%(time)s</a></td> html_template = """<tr><td><a href='%(link)s#%(anchor)s'>%(time)s</a></td>
<td>%(itemtype)s</td><td>%(nick)s</td><td>%(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s</td> <td>%(itemtype)s</td><td>%(nick)s</td><td>%(starthtml)s%(prefix)s<a href="%(url)s">%(url_readable)s</a>%(suffix)s%(endhtml)s</td>
</tr>""" </tr>"""
#html2_template = ("""<i>%(itemtype)s</i>: %(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s """ html2_template = ("""%(starthtml)s%(prefix)s<a href="%(url)s">%(url_readable)s</a>%(suffix)s%(endhtml)s """
# """(%(nick)s, <a href='%(link)s#%(anchor)s'>%(time)s</a>)""")
#html2_template = ("""<i>%(itemtype)s</i>: %(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s """
# """(<a href='%(link)s#%(anchor)s'>%(nick)s</a>, %(time)s)""")
html2_template = ("""%(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s """
"""<span class="details">""" """<span class="details">"""
"""(<a href='%(link)s#%(anchor)s'>%(nick)s</a>, """ """(<a href='%(link)s#%(anchor)s'>%(nick)s</a>, """
"""%(time)s)""" """%(time)s)"""
"""</span>""") """</span>""")
rst_template = """*%(itemtype)s*: %(startrst)s%(url)s %(line)s%(endrst)s (%(rstref)s_)""" rst_template = """*%(itemtype)s*: %(startrst)s%(prefix)s%(url)s%(suffix)s%(endrst)s (%(rstref)s_)"""
text_template = """%(itemtype)s: %(starttext)s%(url)s %(line)s%(endtext)s (%(nick)s, %(time)s)""" text_template = """%(itemtype)s: %(starttext)s%(prefix)s%(url)s%(suffix)s%(endtext)s (%(nick)s, %(time)s)"""
mw_template = """''%(itemtype)s:'' %(startmw)s%(url)s %(line)s%(endmw)s (%(nick)s, %(time)s)""" mw_template = """''%(itemtype)s:'' %(startmw)s%(prefix)s%(url)s%(suffix)s%(endmw)s (%(nick)s, %(time)s)"""
def __init__(self, nick, line, linenum, time_): def __init__(self, nick, line, linenum, time_, M):
self.nick = nick ; self.linenum = linenum self.nick = nick ; self.linenum = linenum
self.time = time.strftime("%H:%M:%S", time_) self.time = time.strftime("%H:%M:%S", time_)
self.url, self.line = (line+' ').split(' ', 1) self.line = line
protocols = M.config.UrlProtocols
protocols = '|'.join(re.escape(p) for p in protocols)
protocols = '(?:'+protocols+')'
# This is gross.
# (.*?) - any prefix, non-greedy
# (%s//[^\s]+ - protocol://... until the next space
# (?<!\.|\)) - but the last character can NOT be . or )
# (.*) - any suffix
url_re = re.compile(r'(.*?)(%s//[^\s]+(?<!\.|\)))(.*)'%protocols)
m = url_re.match(line)
if m:
self.prefix = m.group(1)
self.url = m.group(2)
self.suffix = m.group(3)
else:
# simple matching, the old way.
self.url, self.suffix = (line+' ').split(' ', 1)
self.suffix = ' '+self.suffix
self.prefix = ''
# URL-sanitization # URL-sanitization
self.url_readable = self.url # readable line version self.url_readable = self.url # readable line version
self.url = self.url self.url = self.url

View File

@ -448,7 +448,7 @@ class MeetingCommands(object):
self.addnick(nick, lines=0) self.addnick(nick, lines=0)
def do_link(self, **kwargs): def do_link(self, **kwargs):
"""Add informational item to the minutes.""" """Add informational item to the minutes."""
m = items.Link(**kwargs) m = items.Link(M=self, **kwargs)
self.additem(m) self.additem(m)
def do_commands(self, **kwargs): def do_commands(self, **kwargs):
commands = [ "#"+x[3:] for x in dir(self) if x[:3]=="do_" ] commands = [ "#"+x[3:] for x in dir(self) if x[:3]=="do_" ]

View File

@ -222,6 +222,32 @@ class MeetBotTest(unittest.TestCase):
results, re.IGNORECASE), \ results, re.IGNORECASE), \
"Nick full-word matching failed" "Nick full-word matching failed"
def test_urlMatching(self):
"""Test proper detection of URLs in lines
"""
script = """
20:13:50 <x> #startmeeting
20:13:50 <x> #link prefix http://site1.com suffix
20:13:50 <x> http://site2.com suffix
20:13:50 <x> ftp://ftpsite1.com suffix
20:13:50 <x> #link prefix ftp://ftpsite2.com suffix
20:13:50 <x> irc://ircsite1.com suffix
20:13:50 <x> mailto://a@mail.com suffix
20:13:50 <x> #endmeeting
"""
M = process_meeting(script)
results = M.save()['.html']
assert re.search(r'prefix.*href.*http://site1.com.*suffix',
results), "URL missing 1"
assert re.search(r'href.*http://site2.com.*suffix',
results), "URL missing 2"
assert re.search(r'href.*ftp://ftpsite1.com.*suffix',
results), "URL missing 3"
assert re.search(r'prefix.*href.*ftp://ftpsite2.com.*suffix',
results), "URL missing 4"
assert re.search(r'href.*mailto://a@mail.com.*suffix',
results), "URL missing 5"
def t_css(self): def t_css(self):
"""Runs all CSS-related tests. """Runs all CSS-related tests.
""" """

View File

@ -64,7 +64,15 @@
# links # links
20:13:50 <MrBeige> #topic Links 20:13:50 <MrBeige> #topic Links
20:13:50 <Utahraptor> #link http://test<b>.zgib.net 20:13:50 <Utahraptor> #link http://test<b>.zgib.net
20:13:50 <Utahraptor> #link ftp://test<b>.zgib.net "
20:13:50 <Utahraptor> #link mailto://a@bla"h.com
20:13:50 <Utahraptor> #link http://test.zgib.net/&testpage 20:13:50 <Utahraptor> #link http://test.zgib.net/&testpage
20:13:50 <Utahraptor> #link prefix http://test.zgib.net/&testpage suffix
20:13:50 <Utahraptor> #link prefix ftp://test.zg"ib.net/&testpage suffix
20:13:50 <Utahraptor> #link prefix mailto://a@blah.com&testpage suffix
20:13:50 <Utahraptor> #link prefix http://google.com/. suffix
20:13:50 <Utahraptor> #link prefix (http://google.com/) suffix
# accents # accents
20:13:50 <MrBeige> #topic Character sets 20:13:50 <MrBeige> #topic Character sets