Make link URL matching smarter

Ignore-this: bff39c43073f3576e31bda52b7383e6b - Link URL matching no longer requires the URL to be at the beginning of the line - Includes test case and documentation fix. darcs-hash:20101227093449-82ea9-727739b99ac86256216350b90dc3e710ea181f51.gz
2010-12-27 01:34:49 -08:00 · 2010-12-27 01:34:49 -08:00 · a7c62223ea
commit a7c62223ea
parent 11dcb55c4b
5 changed files with 75 additions and 27 deletions
--- a/doc/Manual.txt
+++ b/doc/Manual.txt
@ -169,21 +169,19 @@ adjust and have MeetBot re-process the logs later.
               release.
 #link
-  Add a link to the minutes.  The URL must be the first thing on the
+
-  line, separated by a space from the rest of the line, and it will be
+  Add a link to the minutes.  The URL will be properly detected within
-  properly hyperlinked.  This command is automatically detected if the line
+  the line in most cases - the URL can't contain spaces.  This command
-  starts with http:, https:, mailto:, and some other common protocols
+  is automatically detected if the line starts with http:, https:,
-  defined in the ``UrlProtocols`` configuration variable.  Examples::
+  mailto:, and some other common protocols defined in the
  ``UrlProtocols`` configuration variable.  Examples::
    < MrBeige> #link http://wiki.debian.org/MeetBot/ is the main page
    < MrBeige> http://wiki.debian.org/MeetBot/ is the main page
  Both of these two examples are equivalent, and will hyperlink
  properly.  The first example below won't hyperlink properly, the
  second one won't be automatically detected::
    < MrBeige> #link the main page is http://wiki.debian.org/MeetBot/
-    < MrBeige> the main page is http://wiki.debian.org/MeetBot/
+               so go there
    < MrBeige> the main page is http://wiki.debian.org/MeetBot/ so go
               there.  (This will NOT be detected automatically)
--- a/ircmeeting/items.py
+++ b/ircmeeting/items.py
@ -30,6 +30,7 @@
 ###
 import os
 import re
 import time
 import writers
@ -73,10 +74,9 @@ class _BaseItem(object):
            replacements[name] = getattr(self, name)
        replacements['nick'] = escapewith(replacements['nick'])
        replacements['link'] = self.logURL(M)
-        if 'line' in replacements:
+        for key in ('line', 'prefix', 'suffix', 'topic'):
-            replacements['line'] = escapewith(replacements['line'])
+            if key in replacements:
-        if 'topic' in replacements:
+                replacements[key] = escapewith(replacements[key])
            replacements['topic'] = escapewith(replacements['topic'])
        if 'url' in replacements:
            replacements['url_quoteescaped'] = \
                                      escapewith(self.url.replace('"', "%22"))
@ -228,24 +228,40 @@ class Rejected(GenericItem):
 class Link(_BaseItem):
    itemtype = 'LINK'
    html_template = """<tr><td><a href='%(link)s#%(anchor)s'>%(time)s</a></td>
-        <td>%(itemtype)s</td><td>%(nick)s</td><td>%(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s</td>
+        <td>%(itemtype)s</td><td>%(nick)s</td><td>%(starthtml)s%(prefix)s<a href="%(url)s">%(url_readable)s</a>%(suffix)s%(endhtml)s</td>
        </tr>"""
-    #html2_template = ("""<i>%(itemtype)s</i>: %(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s """
+    html2_template = ("""%(starthtml)s%(prefix)s<a href="%(url)s">%(url_readable)s</a>%(suffix)s%(endhtml)s """
    #                  """(%(nick)s, <a href='%(link)s#%(anchor)s'>%(time)s</a>)""")
    #html2_template = ("""<i>%(itemtype)s</i>: %(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s """
    #                  """(<a href='%(link)s#%(anchor)s'>%(nick)s</a>, %(time)s)""")
    html2_template = ("""%(starthtml)s<a href="%(url)s">%(url_readable)s</a> %(line)s%(endhtml)s """
                      """<span class="details">"""
                      """(<a href='%(link)s#%(anchor)s'>%(nick)s</a>, """
                      """%(time)s)"""
                      """</span>""")
-    rst_template = """*%(itemtype)s*: %(startrst)s%(url)s %(line)s%(endrst)s  (%(rstref)s_)"""
+    rst_template = """*%(itemtype)s*: %(startrst)s%(prefix)s%(url)s%(suffix)s%(endrst)s  (%(rstref)s_)"""
-    text_template = """%(itemtype)s: %(starttext)s%(url)s %(line)s%(endtext)s  (%(nick)s, %(time)s)"""
+    text_template = """%(itemtype)s: %(starttext)s%(prefix)s%(url)s%(suffix)s%(endtext)s  (%(nick)s, %(time)s)"""
-    mw_template = """''%(itemtype)s:'' %(startmw)s%(url)s %(line)s%(endmw)s  (%(nick)s, %(time)s)"""
+    mw_template = """''%(itemtype)s:'' %(startmw)s%(prefix)s%(url)s%(suffix)s%(endmw)s  (%(nick)s, %(time)s)"""
-    def __init__(self, nick, line, linenum, time_):
+    def __init__(self, nick, line, linenum, time_, M):
        self.nick = nick ; self.linenum = linenum
        self.time = time.strftime("%H:%M:%S", time_)
-        self.url, self.line = (line+' ').split(' ', 1)
+        self.line = line
        protocols = M.config.UrlProtocols
        protocols = '|'.join(re.escape(p) for p in protocols)
        protocols = '(?:'+protocols+')'
        # This is gross.
        # (.*?)          - any prefix, non-greedy
        # (%s//[^\s]+    - protocol://... until the next space
        # (?<!\.|\))     - but the last character can NOT be . or )
        # (.*)           - any suffix
        url_re = re.compile(r'(.*?)(%s//[^\s]+(?<!\.|\)))(.*)'%protocols)
        m = url_re.match(line)
        if m:
            self.prefix = m.group(1)
            self.url    = m.group(2)
            self.suffix = m.group(3)
        else:
            # simple matching, the old way.
            self.url, self.suffix = (line+' ').split(' ', 1)
            self.suffix = ' '+self.suffix
            self.prefix = ''
        # URL-sanitization
        self.url_readable = self.url # readable line version
        self.url = self.url
--- a/ircmeeting/meeting.py
+++ b/ircmeeting/meeting.py
@ -448,7 +448,7 @@ class MeetingCommands(object):
            self.addnick(nick, lines=0)
    def do_link(self, **kwargs):
        """Add informational item to the minutes."""
-        m = items.Link(**kwargs)
+        m = items.Link(M=self, **kwargs)
        self.additem(m)
    def do_commands(self, **kwargs):
        commands = [ "#"+x[3:] for x in dir(self) if x[:3]=="do_" ]
--- a/tests/run_test.py
+++ b/tests/run_test.py
@ -222,6 +222,32 @@ class MeetBotTest(unittest.TestCase):
                         results, re.IGNORECASE), \
                         "Nick full-word matching failed"
    def test_urlMatching(self):
        """Test properly detection of URLs in lines
        """
        script = """
        20:13:50 <x> #startmeeting
        20:13:50 <x> #link prefix http://site1.com suffix
        20:13:50 <x> http://site2.com suffix
        20:13:50 <x> ftp://ftpsite1.com suffix
        20:13:50 <x> #link prefix ftp://ftpsite2.com suffix
        20:13:50 <x> irc://ircsite1.com suffix
        20:13:50 <x> mailto://a@mail.com suffix
        20:13:50 <x> #endmeeting
        """
        M = process_meeting(script)
        results = M.save()['.html']
        assert re.search(r'prefix.*href.*http://site1.com.*suffix',
                         results), "URL missing 1"
        assert re.search(r'href.*http://site2.com.*suffix',
                         results), "URL missing 2"
        assert re.search(r'href.*ftp://ftpsite1.com.*suffix',
                         results), "URL missing 3"
        assert re.search(r'prefix.*href.*ftp://ftpsite2.com.*suffix',
                         results), "URL missing 4"
        assert re.search(r'href.*mailto://a@mail.com.*suffix',
                         results), "URL missing 5"
    def t_css(self):
        """Runs all CSS-related tests.
        """
--- a/tests/test-script-1.log.txt
+++ b/tests/test-script-1.log.txt
@ -64,7 +64,15 @@
 # links
 20:13:50 <MrBeige> #topic Links
 20:13:50 <Utahraptor> #link http://test<b>.zgib.net
 20:13:50 <Utahraptor> #link ftp://test<b>.zgib.net "
 20:13:50 <Utahraptor> #link mailto://a@bla"h.com
 20:13:50 <Utahraptor> #link http://test.zgib.net/&testpage
 20:13:50 <Utahraptor> #link prefix http://test.zgib.net/&testpage suffix
 20:13:50 <Utahraptor> #link prefix ftp://test.zg"ib.net/&testpage suffix
 20:13:50 <Utahraptor> #link prefix mailto://a@blah.com&testpage suffix
 20:13:50 <Utahraptor> #link prefix http://google.com/. suffix
 20:13:50 <Utahraptor> #link prefix (http://google.com/) suffix
 # accents
 20:13:50 <MrBeige> #topic Character sets