Start parsing links from inbound email

Implement parsing text contained in <a> tags enabling users to also
include links in their comment replies posted using inbound email.

Historically, we were holding back on this for two reasons:
* It makes parsing email more complex and fragile
* Spammers can use this to post spammy links on Gerrit

Inbound email is live on gerrit-review.googlesource.com for almost a
year now and we have seen little to no spam. It is enabled on the
majority of Gerrit sites hosted at Google and we have only seen a single
instance of spammy comments. Therefore, the second concern is gone.

This commit adapts the inbound HTML parser and makes some parts of it
more strict to mitigate the first concern. It also adapts some of the
tests as the HTML was incorrect which is surfaced by the enforcement of
stricter parsing rules.

We parse the text contained in the <a> tag, because we expect users to
just copy and paste a link into their email response and have their
email client turn it into a link automatically rather than have them
mingle with a word that they turn into a link manually.

Bug: Issue 7553
Change-Id: I70ca1977554d9f218280abe188d670ea41daa1f4
This commit is contained in:
Patrick Hiesel
2017-11-27 10:48:19 +01:00
parent a499726ace
commit 78fe705caa
6 changed files with 91 additions and 36 deletions

View File

@@ -71,7 +71,12 @@ public class HtmlParser {
for (Element e : d.body().getAllElements()) {
String elementName = e.tagName();
boolean isInBlockQuote =
e.parents().stream().filter(p -> p.tagName().equals("blockquote")).findAny().isPresent();
e.parents()
.stream()
.anyMatch(
p ->
p.tagName().equals("blockquote")
|| MAIL_PROVIDER_EXTRAS.contains(p.className()));
if (elementName.equals("a")) {
String href = e.attr("href");
@@ -96,39 +101,63 @@ public class HtmlParser {
lastEncounteredComment = perspectiveComment;
iter.next();
}
continue;
} else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) {
// This is a regular inline comment
lastEncounteredComment = perspectiveComment;
iter.next();
continue;
}
} else if (!isInBlockQuote
&& elementName.equals("div")
&& !MAIL_PROVIDER_EXTRAS.contains(e.className())) {
// This is a comment typed by the user
// Replace non-breaking spaces and trim string
String content = e.ownText().replace('\u00a0', ' ').trim();
if (!Strings.isNullOrEmpty(content)) {
if (lastEncounteredComment == null && lastEncounteredFileName == null) {
// Remove quotation line, email signature and
// "Sent from my xyz device"
content = ParserUtil.trimQuotation(content);
// TODO(hiesel) Add more sanitizer
if (!Strings.isNullOrEmpty(content)) {
ParserUtil.appendOrAddNewComment(
new MailComment(content, null, null, MailComment.CommentType.CHANGE_MESSAGE),
parsedComments);
}
} else if (lastEncounteredComment == null) {
}
if (isInBlockQuote) {
// There is no user-input in quoted text
continue;
}
if (!elementName.equals("div") && !elementName.equals("a")) {
// We only accept div and a since these tags contain user input
continue;
}
if (elementName.equals("a") && e.attr("href").startsWith("mailto:")) {
// We don't accept mailto: links in general as they often appear in reply-to lines
// (User<user@gmail.com> wrote: ...)
continue;
}
// This is a comment typed by the user
// Replace non-breaking spaces and trim string
String content = e.ownText().replace('\u00a0', ' ').trim();
boolean isLink = elementName.equals("a");
if (!Strings.isNullOrEmpty(content)) {
if (lastEncounteredComment == null && lastEncounteredFileName == null) {
// Remove quotation line, email signature and
// "Sent from my xyz device"
content = ParserUtil.trimQuotation(content);
// TODO(hiesel) Add more sanitizer
if (!Strings.isNullOrEmpty(content)) {
ParserUtil.appendOrAddNewComment(
new MailComment(
content, lastEncounteredFileName, null, MailComment.CommentType.FILE_COMMENT),
parsedComments);
} else {
ParserUtil.appendOrAddNewComment(
new MailComment(
content, null, lastEncounteredComment, MailComment.CommentType.INLINE_COMMENT),
content, null, null, MailComment.CommentType.CHANGE_MESSAGE, isLink),
parsedComments);
}
} else if (lastEncounteredComment == null) {
ParserUtil.appendOrAddNewComment(
new MailComment(
content,
lastEncounteredFileName,
null,
MailComment.CommentType.FILE_COMMENT,
isLink),
parsedComments);
} else {
ParserUtil.appendOrAddNewComment(
new MailComment(
content,
null,
lastEncounteredComment,
MailComment.CommentType.INLINE_COMMENT,
isLink),
parsedComments);
}
}
}

View File

@@ -29,14 +29,17 @@ public class MailComment {
Comment inReplyTo;
String fileName;
String message;
boolean isLink;
public MailComment() {}
public MailComment(String message, String fileName, Comment inReplyTo, CommentType type) {
public MailComment(
String message, String fileName, Comment inReplyTo, CommentType type, boolean isLink) {
this.message = message;
this.fileName = fileName;
this.inReplyTo = inReplyTo;
this.type = type;
this.isLink = isLink;
}
/**

View File

@@ -95,8 +95,9 @@ public class ParserUtil {
MailComment lastComment = Iterables.getLast(comments);
if (comment.isSameCommentPath(lastComment)) {
// Merge the two comments
lastComment.message += "\n\n" + comment.message;
// Merge the two comments. Links should just be appended, while regular text that came from
// different <div> elements should be separated by a paragraph.
lastComment.message += (comment.isLink ? " " : "\n\n") + comment.message;
return;
}

View File

@@ -96,7 +96,8 @@ public class AbstractParserTest {
comments.add(newComment("c1", "gerrit-server/test.txt", "comment", 0));
comments.add(newComment("c2", "gerrit-server/test.txt", "comment", 2));
comments.add(newComment("c3", "gerrit-server/test.txt", "comment", 3));
comments.add(newRangeComment("c4", "gerrit-server/readme.txt", "comment", 3));
comments.add(newComment("c3", "gerrit-server/test.txt", "comment", 115));
comments.add(newRangeComment("c5", "gerrit-server/readme.txt", "comment", 3));
return comments;
}
}

View File

@@ -26,7 +26,7 @@ public class GmailHtmlParserTest extends HtmlParserTest {
+ "On Fri, Nov 18, 2016 at 11:15 AM, foobar (Gerrit) noreply@gerrit.com"
+ "<span dir=\"ltr\">&lt;<a href=\"mailto:noreply@gerrit.com\" "
+ "target=\"_blank\">noreply@gerrit.com</a>&gt;</span> wrote:<br>"
+ "<blockquote class=\"gmail_quote\" "
+ "</div></div><blockquote class=\"gmail_quote\" "
+ "<p>foobar <strong>posted comments</strong> on this change.</p>"
+ "<p><a href=\""
+ CHANGE_URL
@@ -41,7 +41,6 @@ public class GmailHtmlParserTest extends HtmlParserTest {
+ "/1/gerrit-server/test.txt\">"
+ "File gerrit-server/<wbr>test.txt:</a></p>"
+ commentBlock(f1)
+ "<li><p>"
+ "<a href=\""
+ CHANGE_URL
+ "/1/gerrit-server/test.txt\">"
@@ -105,7 +104,7 @@ public class GmailHtmlParserTest extends HtmlParserTest {
+ "</p><p>Gerrit-MessageType: comment<br>"
+ "Footer omitted</p>"
+ "<div><div></div></div>"
+ "<p>Gerrit-HasComments: Yes</p></blockquote></div><br></div></div>";
+ "<p>Gerrit-HasComments: Yes</p></blockquote></div>";
return email;
}

View File

@@ -38,6 +38,28 @@ public abstract class HtmlParserTest extends AbstractParserTest {
assertChangeMessage("Looks good to me", parsedComments.get(0));
}
@Test
public void changeMessageWithLink() {
MailMessage.Builder b = newMailMessageBuilder();
b.htmlContent(
newHtmlBody(
"Did you consider this: "
+ "<a href=\"http://gerritcodereview.com\">http://gerritcodereview.com</a>",
null,
null,
null,
null,
null,
null));
List<Comment> comments = defaultComments();
List<MailComment> parsedComments = HtmlParser.parse(b.build(), comments, "");
assertThat(parsedComments).hasSize(1);
assertChangeMessage(
"Did you consider this: http://gerritcodereview.com", parsedComments.get(0));
}
@Test
public void simpleInlineComments() {
MailMessage.Builder b = newMailMessageBuilder();
@@ -57,7 +79,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
assertThat(parsedComments).hasSize(3);
assertChangeMessage("Looks good to me", parsedComments.get(0));
assertInlineComment("I have a comment on this.", parsedComments.get(1), comments.get(1));
assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(3));
assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(4));
}
@Test
@@ -79,7 +101,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
assertThat(parsedComments).hasSize(3);
assertChangeMessage("Looks good to me", parsedComments.get(0));
assertFileComment("This is a nice file", parsedComments.get(1), comments.get(1).key.filename);
assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(3));
assertInlineComment("Also have a comment here.", parsedComments.get(2), comments.get(4));
}
@Test
@@ -105,7 +127,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
assertThat(parsedComments).hasSize(2);
assertFileComment("This is a nice file", parsedComments.get(0), comments.get(1).key.filename);
assertInlineComment("Also have a comment here.", parsedComments.get(1), comments.get(3));
assertInlineComment("Also have a comment here.", parsedComments.get(1), comments.get(4));
}
@Test
@@ -122,7 +144,7 @@ public abstract class HtmlParserTest extends AbstractParserTest {
assertThat(parsedComments).hasSize(3);
assertChangeMessage(txtMessage, parsedComments.get(0));
assertFileComment(txtMessage, parsedComments.get(1), comments.get(1).key.filename);
assertInlineComment(txtMessage, parsedComments.get(2), comments.get(3));
assertInlineComment(txtMessage, parsedComments.get(2), comments.get(4));
}
/**