Merge "Adding a new tools to search topic on the ML"
This commit is contained in:
commit
b2f8125915
@ -748,6 +748,74 @@ To check for ussuri release note links:
|
||||
|
||||
tools/add_release_note_links.sh ussuri
|
||||
|
||||
tools/search_emails.py
|
||||
----------------------
|
||||
|
||||
A script to search emails on the openstack-discuss mailing list. By default
|
||||
this script will search for emails related to the release team, but
|
||||
the topic can be overridden to look for specific subjects.
|
||||
|
||||
Examples:
|
||||
|
||||
The most basic example is the following, it will search for emails related
|
||||
to releases topics on openstack-discuss during the entire life of
|
||||
this mailing list (from its creation date in November 2018 to
|
||||
the current date):
|
||||
|
||||
::
|
||||
|
||||
$ tools/search_emails.py
|
||||
|
||||
To look for emails related to releases, filtered between two dates:
|
||||
|
||||
::
|
||||
|
||||
$ tools/search_emails.py --starting-date 2020-04-01 --ending-date 2020-4-1
|
||||
|
||||
To look for emails related to releases, filtered by authors:
|
||||
|
||||
::
|
||||
|
||||
$ tools/search_emails.py --authors "Herve Beraud" "Sean McGinnis"
|
||||
|
||||
To look for emails related to releases, sent between two dates by given authors:
|
||||
|
||||
::
|
||||
|
||||
$ tools/search_emails.py --starting-date 2020-04-01 --ending-date 2020-4-1 --authors "Herve Beraud" "Sean McGinnis"
|
||||
|
||||
To look for emails related to release FFEs since August 2020:
|
||||
|
||||
::
|
||||
$ tools/search_emails.py --topic ".?\[release\].*FFE.*" --starting-date 2020-8-1
|
||||
|
||||
To look for all the release countdown emails sent during victoria:
|
||||
|
||||
::
|
||||
|
||||
$ tools/search_emails.py --topic ".?\[release\] Release countdown.*" --starting-date 2020-5-1
|
||||
|
||||
By default the search is executed on ``http://lists.openstack.org/pipermail/openstack-discuss``
|
||||
but you can change the URL to search a different mailing list.
|
||||
|
||||
In the following example we look for all release jobs that failed for ``openstack/watcher*``:
|
||||
|
||||
::
|
||||
|
||||
$ tools/search_emails.py --topic ".?openstack/watcher.*" --mailing-list http://lists.openstack.org/pipermail/
|
||||
release-job-failures/ --starting-date 2016-6-1
|
||||
|
||||
Notice that by default we search on ``http://lists.openstack.org/pipermail/openstack-discuss``
|
||||
and this mailing list was created in November 2018 so the ``--starting-date``
|
||||
is initialized to this date by default, but the date can't be before this
|
||||
default date except if you search on a different mailing list and if you also
|
||||
override it by passing params with ``--mailing-list``.
|
||||
|
||||
For more usage and examples:
|
||||
|
||||
::
|
||||
$ tools/search_emails.py -h
|
||||
|
||||
tools/membership_freeze_test.py
|
||||
--------------------------------
|
||||
|
||||
|
247
tools/search_emails.py
Executable file
247
tools/search_emails.py
Executable file
@ -0,0 +1,247 @@
|
||||
#!/usr/bin/python3
|
||||
#
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
from dateutil.relativedelta import relativedelta
|
||||
import re
|
||||
import sys
|
||||
import textwrap
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
|
||||
# First month of the default mailing list archive: used as the default
# starting date and as the earliest date accepted for that list.
MINIMAL_DATE = datetime.datetime(2018, 11, 1)
# Archive searched when --mailing-list is not given.
BASE_URL = "http://lists.openstack.org/pipermail/openstack-discuss"
# Raw string: ``\[`` and ``\]`` are regex escapes, not Python string escapes.
DEFAULT_SEARCHING_PATTERN = r'.?\[release\].*'
|
||||
|
||||
|
||||
def get(url, timeout=30):
    """Request a given url and parse the response as HTML.

    :param url: address of the page to download.
    :param timeout: number of seconds to wait for the server before giving
        up, so that a stalled connection cannot block the script forever.
    :returns: the page parsed with ``BeautifulSoup``, or ``None`` (after
        printing an error) if the request failed or the server did not
        answer with a 200 status code.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Network level failures (DNS, refused connection, timeout...) are
        # reported the same way as bad status codes instead of crashing.
        print(f"Error with {url} request failed: {err}")
        return None
    if response.status_code != 200:
        print(f"Error with {url} server returned code {response.status_code}")
        return None
    return BeautifulSoup(response.content, features="html.parser")
|
||||
|
||||
|
||||
def get_author(email):
    """Retrieve the author of an email.

    :param email: url of the email page to download.
    :returns: the string held by the first ``<b>`` tag of the page, or
        ``None`` if the page could not be retrieved or has no ``<b>`` tag.
    """
    page = get(email)
    if not page:
        return None
    # find() returns None instead of raising when the tag is missing, which
    # lets callers handle the page like any other retrieval failure.
    author_tag = page.find('b')
    if author_tag is None:
        return None
    return author_tag.string
|
||||
|
||||
|
||||
def is_sent_by(author, authors):
    """Tell whether ``author`` is one of the given ``authors``.

    Names are compared case-insensitively. An empty ``authors`` collection
    means that no filtering is requested, so ``True`` is returned.
    """
    if authors:
        wanted = author.lower()
        known = [name.lower() for name in authors]
        return wanted in known
    return True
|
||||
|
||||
|
||||
def search(month_url, topic, authors):
    """Search for matching emails in one monthly archive page.

    :param month_url: url of the monthly archive index to scan.
    :param topic: regex pattern searched in the subject of each email.
    :param authors: names of the authors to keep; if empty, emails from
        any author are kept.
    :returns: a ``(results, exit_code)`` tuple. ``results`` is a list of
        dicts with the ``url``, ``subject`` and ``author`` keys.
        ``exit_code`` is 1 if the index or one of the matching emails could
        not be retrieved, 0 otherwise.
    """
    exit_code = 0
    results = []
    soup = get(month_url)
    if not soup:
        return results, 1
    # Compiled once instead of being looked up again for every link.
    topic_regex = re.compile(topic)
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        # Only collect legit urls (ML links are at the 000000.html)
        if not re.match(r"[0-9]{6}\.html", href):
            continue
        # ``tag.string`` is None when the link does not hold a single string
        # child; fall back on the whole text of the link in that case.
        subject = tag.string if tag.string is not None else tag.get_text()
        email_subject = subject.replace('\n', '')
        if not topic_regex.search(email_subject):
            continue
        url = f'{month_url}/{href}'
        author = get_author(url)
        if not author:
            # Remember the failure but keep scanning the other emails.
            exit_code = 1
            continue
        if not is_sent_by(author, authors):
            continue
        results.append(
            {
                'url': url,
                'subject': email_subject,
                'author': author,
            }
        )
    return results, exit_code
|
||||
|
||||
|
||||
def display(results):
    """Print the emails found, grouped by year and month.

    ``results`` maps a year to a mapping of months, each month holding an
    ``emails`` list of dicts with the ``subject``, ``author`` and ``url``
    keys. Months without emails are skipped. If nothing was found at all a
    default message is printed instead.
    """
    lines = []
    total = 0
    for year, months in results.items():
        for month, data in months.items():
            emails = data['emails']
            if not emails:
                continue
            lines.append(f'{year}-{month}:')
            lines.extend(
                f"\t- {email['subject']} - {email['author']}\n\t{email['url']}"
                for email in emails)
            total += len(emails)
    if not total:
        print('No results found...')
        return
    print(f'{total} result(s) have been found')
    print('\n'.join(lines))
|
||||
|
||||
|
||||
def mailing_list_url(string):
    """Validate and normalize a mailing list url (argparse ``type``).

    :param string: the url given on the command line.
    :returns: the url without trailing slashes, so that month paths can be
        appended with a single ``/``.
    :raises argparse.ArgumentTypeError: if the url cannot be parsed or lacks
        a scheme, a network location or a path.
    """
    error_msg = f'{string} is not a valid url'
    try:
        result = urlparse(string)
    except ValueError as err:
        raise argparse.ArgumentTypeError(error_msg) from err
    if not all([result.scheme, result.netloc, result.path]):
        raise argparse.ArgumentTypeError(error_msg)
    # Strip every trailing slash, not only the last one.
    return string.rstrip("/")
|
||||
|
||||
|
||||
def _build_parser():
    """Create the command line parser of the script."""
    # Raw string: the regex examples contain ``\[`` which is not a valid
    # Python string escape.
    epilog = textwrap.dedent(r"""
    Topic:

    Various topic can be used to looking for specific topics, by example
    topic can be set to `.?\[oslo\].*` to search all emails related to
    oslo.

    Useful topics:
    - `.?\[release\] Release countdown.*` (looking for release countdown)
    - `.?\[all\].*` (looking for email related to all)
    - `.?\[requirements\].*` (looking for email related to requirements)
    - `.?\[release\].*FFE.*` (looking for email related to release FFE)

    Usages:

    To looking for emails related to release and filtered between 2 dates:
    ```
    $ {cmd} --starting-date 2020-04-01 --ending-date 2020-4-1
    ```
    To looking for emails related to release and filtered by authors:
    ```
    $ {cmd} --authors "Herve Beraud" "Sean McGinnis"
    ```
    To looking for emails related to release between 2 dates and sent by authors:
    ```
    $ {cmd} --starting-date 2020-04-01 --ending-date 2020-4-1 --authors "Herve Beraud" "Sean McGinnis"
    ```
    To looking for emails related to release FFE since August 2020:
    ```
    $ {cmd} --topic ".?\[release\].*FFE.*" --starting-date 2020-8-1
    ```
    To looking for all the release countdown emails sent during victoria (18 May 2020 - 16 October 2020):
    ```
    $ {cmd} --topic ".?\[release\] Release countdown.*" --starting-date 2020-5-1
    ```
    """.format(cmd=sys.argv[0]))  # noqa
    parser = argparse.ArgumentParser(
        description='Search emails on the mailing list by topic and authors',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog)
    parser.add_argument(
        '-t',
        '--topic',
        default=DEFAULT_SEARCHING_PATTERN,
        help='Regex pattern to match in emails subject. '
             f'The default pattern is set to `{DEFAULT_SEARCHING_PATTERN}` '
             'to looking for release topics on the ML.'
    )
    parser.add_argument(
        '-a',
        '--authors',
        nargs='+',
        default=[], type=str,
        help='filtering on authors within emails found with the '
             'related topic. Many authors can be given.')
    parser.add_argument(
        '--starting-date',
        type=lambda s: datetime.datetime.strptime(s, '%Y-%m-%d'),
        default=MINIMAL_DATE,
        help="Starting research at the given date. Can't be before "
             f"{MINIMAL_DATE} which is the minimal date allowed for research. "
             "Notice that a research doesn't looking for a day but for an "
             "entire month, in other words day will be ignored."
    )
    parser.add_argument(
        '--ending-date',
        type=lambda s: datetime.datetime.strptime(s, '%Y-%m-%d'),
        default=datetime.datetime.today(),
        help='Ending research at the given date. '
             'Default set to today. '
             "Notice that a research doesn't looking for a day but for an "
             "entire month, in other words day will be ignored."
    )
    parser.add_argument(
        '--mailing-list',
        type=mailing_list_url,
        default=BASE_URL,
        help='Mailing list url to use for search. Should be a valid url. '
             f'Default set to {BASE_URL}.'
    )
    return parser


def main():
    """Main entrypoint.

    Parse the command line, search every monthly archive between the
    starting and the ending dates, then display the matching emails.

    :returns: the process exit code, 0 if every page was retrieved,
        otherwise the first non-zero code returned by ``search``.
    """
    args = _build_parser().parse_args()
    mailing_list = args.mailing_list
    # Searches are done month by month and the day is documented as ignored,
    # so both bounds are aligned on the first day of their month. Otherwise
    # a starting day greater than the ending day skips the last month.
    cursor = args.starting_date.replace(day=1)
    ending = args.ending_date.replace(
        day=1, hour=0, minute=0, second=0, microsecond=0)
    if mailing_list == BASE_URL and cursor < MINIMAL_DATE:
        print(f"--starting-date can't be inferior to {MINIMAL_DATE} "
              f"with {mailing_list}")
        sys.exit(1)
    print('Looking for emails sent on {mailing_list} who match `{topic}` '
          'between {start} and {end} and sent by {authors}\n...'.format(
              mailing_list=mailing_list,
              topic=args.topic,
              start=cursor.strftime('%Y %B'),
              end=ending.strftime('%Y %B'),
              authors=', '.join(args.authors) if args.authors else 'anybody'))
    exit_code = 0
    results = {}
    while cursor <= ending:
        year = cursor.year
        month = cursor.strftime("%B")
        url = f"{mailing_list}/{year}-{month}"
        print(f"Analyzing {month} {year}: {url}")
        emails, current_exit_code = search(url, args.topic, args.authors)
        results.setdefault(year, {})[month] = {
            "url": url,
            "emails": emails,
        }
        cursor += relativedelta(months=1)
        # Keep the first failure code seen, later successes don't reset it.
        exit_code = exit_code or current_exit_code
    display(results)
    return exit_code
|
||||
|
||||
|
||||
# Script entry point: the value returned by main() is the process exit code.
if __name__ == '__main__':
    raise SystemExit(main())
|
Loading…
Reference in New Issue
Block a user