Start parsing links from inbound email

Implement parsing text contained in <a> tags enabling users to also include links in their comment replies posted using inbound email. Historically, we were holding back on this for two reasons: * It makes parsing email more complex and fragile * Spammers can use this to post spammy links on Gerrit Inbound email is live on gerrit-review.googlesource.com for almost a year now and we have seen little to no spam. It is enabled on the majority of Gerrit sites hosted at Google and we have only seen a single instance of spammy comments. Therefore, the second concern is gone. This commit adapts the inbound HTML parser and makes some parts of it more strict to mitigate the first concern. It also adapts some of the tests as the HTML was incorrect which is surfaced by the enforcement of stricter parsing rules. We parse the text contained in the <a> tag, because we expect users to just copy and paste a link into their email response and have their email client turn it into a link automatically rather than have them mingle with a word that they turn into a link manually. Bug: Issue 7553 Change-Id: I70ca1977554d9f218280abe188d670ea41daa1f4
2017-11-27 10:48:19 +01:00
parent a499726ace
commit 78fe705caa
6 changed files with 91 additions and 36 deletions
--- a/java/com/google/gerrit/server/mail/receive/HtmlParser.java
+++ b/java/com/google/gerrit/server/mail/receive/HtmlParser.java
@@ -71,7 +71,12 @@ public class HtmlParser {
    for (Element e : d.body().getAllElements()) {
      String elementName = e.tagName();
      boolean isInBlockQuote =
-          e.parents().stream().filter(p -> p.tagName().equals("blockquote")).findAny().isPresent();
+          e.parents()
+              .stream()
+              .anyMatch(
+                  p ->
+                      p.tagName().equals("blockquote")
+                          || MAIL_PROVIDER_EXTRAS.contains(p.className()));

      if (elementName.equals("a")) {
        String href = e.attr("href");
@@ -96,39 +101,63 @@ public class HtmlParser {
            lastEncounteredComment = perspectiveComment;
            iter.next();
          }
+          continue;
        } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) {
          // This is a regular inline comment
          lastEncounteredComment = perspectiveComment;
          iter.next();
+          continue;
        }
-      } else if (!isInBlockQuote
-          && elementName.equals("div")
-          && !MAIL_PROVIDER_EXTRAS.contains(e.className())) {
-        // This is a comment typed by the user
-        // Replace non-breaking spaces and trim string
-        String content = e.ownText().replace('\u00a0', ' ').trim();
-        if (!Strings.isNullOrEmpty(content)) {
-          if (lastEncounteredComment == null && lastEncounteredFileName == null) {
-            // Remove quotation line, email signature and
-            // "Sent from my xyz device"
-            content = ParserUtil.trimQuotation(content);
-            // TODO(hiesel) Add more sanitizer
-            if (!Strings.isNullOrEmpty(content)) {
-              ParserUtil.appendOrAddNewComment(
-                  new MailComment(content, null, null, MailComment.CommentType.CHANGE_MESSAGE),
-                  parsedComments);
-            }
-          } else if (lastEncounteredComment == null) {
+      }
+
+      if (isInBlockQuote) {
+        // There is no user-input in quoted text
+        continue;
+      }
+      if (!elementName.equals("div") && !elementName.equals("a")) {
+        // We only accept div and a since these tags contain user input
+        continue;
+      }
+      if (elementName.equals("a") && e.attr("href").startsWith("mailto:")) {
+        // We don't accept mailto: links in general as they often appear in reply-to lines
+        // (User<user@gmail.com> wrote: ...)
+        continue;
+      }
+
+      // This is a comment typed by the user
+      // Replace non-breaking spaces and trim string
+      String content = e.ownText().replace('\u00a0', ' ').trim();
+      boolean isLink = elementName.equals("a");
+      if (!Strings.isNullOrEmpty(content)) {
+        if (lastEncounteredComment == null && lastEncounteredFileName == null) {
+          // Remove quotation line, email signature and
+          // "Sent from my xyz device"
+          content = ParserUtil.trimQuotation(content);
+          // TODO(hiesel) Add more sanitizer
+          if (!Strings.isNullOrEmpty(content)) {
            ParserUtil.appendOrAddNewComment(
                new MailComment(
-                    content, lastEncounteredFileName, null, MailComment.CommentType.FILE_COMMENT),
-                parsedComments);
-          } else {
-            ParserUtil.appendOrAddNewComment(
-                new MailComment(
-                    content, null, lastEncounteredComment, MailComment.CommentType.INLINE_COMMENT),
+                    content, null, null, MailComment.CommentType.CHANGE_MESSAGE, isLink),
                parsedComments);
          }
+        } else if (lastEncounteredComment == null) {
+          ParserUtil.appendOrAddNewComment(
+              new MailComment(
+                  content,
+                  lastEncounteredFileName,
+                  null,
+                  MailComment.CommentType.FILE_COMMENT,
+                  isLink),
+              parsedComments);
+        } else {
+          ParserUtil.appendOrAddNewComment(
+              new MailComment(
+                  content,
+                  null,
+                  lastEncounteredComment,
+                  MailComment.CommentType.INLINE_COMMENT,
+                  isLink),
+              parsedComments);
        }
      }
    }
--- a/java/com/google/gerrit/server/mail/receive/MailComment.java
+++ b/java/com/google/gerrit/server/mail/receive/MailComment.java
@@ -29,14 +29,17 @@ public class MailComment {
  Comment inReplyTo;
  String fileName;
  String message;
+  boolean isLink;

  public MailComment() {}

-  public MailComment(String message, String fileName, Comment inReplyTo, CommentType type) {
+  public MailComment(
+      String message, String fileName, Comment inReplyTo, CommentType type, boolean isLink) {
    this.message = message;
    this.fileName = fileName;
    this.inReplyTo = inReplyTo;
    this.type = type;
+    this.isLink = isLink;
  }

  /**
--- a/java/com/google/gerrit/server/mail/receive/ParserUtil.java
+++ b/java/com/google/gerrit/server/mail/receive/ParserUtil.java
@@ -95,8 +95,9 @@ public class ParserUtil {
    MailComment lastComment = Iterables.getLast(comments);

    if (comment.isSameCommentPath(lastComment)) {
-      // Merge the two comments
-      lastComment.message += "\n\n" + comment.message;
+      // Merge the two comments. Links should just be appended, while regular text that came from
+      // different <div> elements should be separated by a paragraph.
+      lastComment.message += (comment.isLink ? " " : "\n\n") + comment.message;
      return;
    }

--- a/javatests/com/google/gerrit/server/mail/receive/AbstractParserTest.java
+++ b/javatests/com/google/gerrit/server/mail/receive/AbstractParserTest.java
@@ -96,7 +96,8 @@ public class AbstractParserTest {
    comments.add(newComment("c1", "gerrit-server/test.txt", "comment", 0));
    comments.add(newComment("c2", "gerrit-server/test.txt", "comment", 2));
    comments.add(newComment("c3", "gerrit-server/test.txt", "comment", 3));
-    comments.add(newRangeComment("c4", "gerrit-server/readme.txt", "comment", 3));
+    comments.add(newComment("c3", "gerrit-server/test.txt", "comment", 115));
+    comments.add(newRangeComment("c5", "gerrit-server/readme.txt", "comment", 3));
    return comments;
  }
 }
--- a/javatests/com/google/gerrit/server/mail/receive/GmailHtmlParserTest.java
+++ b/javatests/com/google/gerrit/server/mail/receive/GmailHtmlParserTest.java
@@ -26,7 +26,7 @@ public class GmailHtmlParserTest extends HtmlParserTest {
            + "On Fri, Nov 18, 2016 at 11:15 AM, foobar (Gerrit) noreply@gerrit.com"
            + "<span dir=\"ltr\">&lt;<a href=\"mailto:noreply@gerrit.com\" "
            + "target=\"_blank\">noreply@gerrit.com</a>&gt;</span> wrote:<br>"
-            + "<blockquote class=\"gmail_quote\" "
+            + "</div></div><blockquote class=\"gmail_quote\" "
            + "<p>foobar <strong>posted comments</strong> on this change.</p>"
            + "<p><a href=\""
            + CHANGE_URL
@@ -41,7 +41,6 @@ public class GmailHtmlParserTest extends HtmlParserTest {
            + "/1/gerrit-server/test.txt\">"
            + "File gerrit-server/<wbr>test.txt:</a></p>"
            + commentBlock(f1)
-            + "<li><p>"
            + "<a href=\""
            + CHANGE_URL
            + "/1/gerrit-server/test.txt\">"
@@ -105,7 +104,7 @@ public class GmailHtmlParserTest extends HtmlParserTest {
            + "</p><p>Gerrit-MessageType: comment<br>"
            + "Footer omitted</p>"
            + "<div><div></div></div>"
-            + "<p>Gerrit-HasComments: Yes</p></blockquote></div><br></div></div>";
+            + "<p>Gerrit-HasComments: Yes</p></blockquote></div>";
    return email;
  }

--- a/javatests/com/google/gerrit/server/mail/receive/HtmlParserTest.java
+++ b/javatests/com/google/gerrit/server/mail/receive/HtmlParserTest.java
@@ -38,6 +38,28 @@ public abstract class HtmlParserTest extends AbstractParserTest {
    assertChangeMessage("Looks good to me", parsedComments.get(0));
  }

+  @Test
+  public void changeMessageWithLink() {
+    MailMessage.Builder b = newMailMessageBuilder();
+    b.htmlContent(
+        newHtmlBody(
+            "Did you consider this: "
+                + "<a href=\"http://gerritcodereview.com\">http://gerritcodereview.com</a>",
+            null,
+            null,
+            null,
+            null,
+            null,
+            null));
+
+    List<Comment> comments = defaultComments();
+    List<MailComment> parsedComments = HtmlParser.parse(b.build(), comments, "");
+
+    assertThat(parsedComments).hasSize(1);
+    assertChangeMessage(
+        "Did you consider this: http://gerritcodereview.com", parsedComments.get(0));
+  }
+
  @Test
  public void simpleInlineComments() {
    MailMessage.Builder b = newMailMessageBuilder();
@@ -57,7 +79,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
    assertThat(parsedComments).hasSize(3);
    assertChangeMessage("Looks good to me", parsedComments.get(0));
    assertInlineComment("I have a comment on this.", parsedComments.get(1), comments.get(1));
-    assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(3));
+    assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(4));
  }

  @Test
@@ -79,7 +101,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
    assertThat(parsedComments).hasSize(3);
    assertChangeMessage("Looks good to me", parsedComments.get(0));
    assertFileComment("This is a nice file", parsedComments.get(1), comments.get(1).key.filename);
-    assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(3));
+    assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(4));
  }

  @Test
@@ -105,7 +127,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {

    assertThat(parsedComments).hasSize(2);
    assertFileComment("This is a nice file", parsedComments.get(0), comments.get(1).key.filename);
-    assertInlineComment("Also have a comment here.", parsedComments.get(1), comments.get(3));
+    assertInlineComment("Also have a comment here.", parsedComments.get(1), comments.get(4));
  }

  @Test
@@ -122,7 +144,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
    assertThat(parsedComments).hasSize(3);
    assertChangeMessage(txtMessage, parsedComments.get(0));
    assertFileComment(txtMessage, parsedComments.get(1), comments.get(1).key.filename);
-    assertInlineComment(txtMessage, parsedComments.get(2), comments.get(3));
+    assertInlineComment(txtMessage, parsedComments.get(2), comments.get(4));
  }

  /**