Merge "Adding a new tools to search topic on the ML"
This commit is contained in:
commit
b2f8125915
@ -748,6 +748,74 @@ To check for ussuri release note links:
|
|||||||
|
|
||||||
tools/add_release_note_links.sh ussuri
|
tools/add_release_note_links.sh ussuri
|
||||||
|
|
||||||
|
tools/search_emails.py
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
A script to search emails on the openstack-discuss mailing list. By default
|
||||||
|
this script will search for emails related to the release team, but
|
||||||
|
the topic can be overridden to look for specific subjects.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
The most basic example is the following, it will search for emails related
|
||||||
|
to release topics on openstack-discuss during the entire life of
|
||||||
|
this mailing list (between the creation date in November 2018 to
|
||||||
|
the current date):
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ tools/search_emails.py
|
||||||
|
|
||||||
|
To look for emails related to release and filtered between 2 dates:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ tools/search_emails.py --starting-date 2020-04-01 --ending-date 2020-4-1
|
||||||
|
|
||||||
|
To look for emails related to release and filtered by authors:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ tools/search_emails.py --authors "Herve Beraud" "Sean McGinnis"
|
||||||
|
|
||||||
|
To look for emails related to release between 2 dates and sent by authors:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ tools/search_emails.py --starting-date 2020-04-01 --ending-date 2020-4-1 --authors "Herve Beraud" "Sean McGinnis"
|
||||||
|
|
||||||
|
To look for emails related to release FFE since August 2020:
|
||||||
|
|
||||||
|
::
|
||||||
|
$ tools/search_emails.py --topic ".?\[release\].*FFE.*" --starting-date 2020-8-1
|
||||||
|
|
||||||
|
To look for all the release countdown emails sent during victoria:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ tools/search_emails.py --topic ".?\[release\] Release countdown.*" --starting-date 2020-5-1
|
||||||
|
|
||||||
|
By default the search is executed on ``http://lists.openstack.org/pipermail/openstack-discuss``
|
||||||
|
but you can change the url to search on a different mailing list.
|
||||||
|
|
||||||
|
In the following example we look for all release jobs that failed for ``openstack/watcher*``:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ tools/search_emails.py --topic ".?openstack/watcher.*" --mailing-list http://lists.openstack.org/pipermail/release-job-failures/ --starting-date 2016-6-1
|
||||||
|
|
||||||
|
Notice that by default we search on ``http://lists.openstack.org/pipermail/openstack-discuss``
|
||||||
|
and this mailing list was created in November 2018 so the ``--starting-date``
|
||||||
|
is initialized to this date by default, but the date can't be before this
|
||||||
|
default date except if you search on a different mailing list and if you also
|
||||||
|
override it by passing params with ``--mailing-list``.
|
||||||
|
|
||||||
|
For more usage and examples:
|
||||||
|
|
||||||
|
::
|
||||||
|
$ tools/search_emails.py -h
|
||||||
|
|
||||||
tools/membership_freeze_test.py
|
tools/membership_freeze_test.py
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
||||||
|
247
tools/search_emails.py
Executable file
247
tools/search_emails.py
Executable file
@ -0,0 +1,247 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
#
|
||||||
|
# All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import datetime
|
||||||
|
from dateutil.relativedelta import relativedelta
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import textwrap
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# Earliest month that can be searched on the default mailing list
# (openstack-discuss); main() rejects earlier --starting-date values for it.
MINIMAL_DATE = datetime.datetime(2018, 11, 1)
# Archive root used when --mailing-list is not given.
BASE_URL = "http://lists.openstack.org/pipermail/openstack-discuss"
# Default subject regex: matches subjects tagged with ``[release]``.
# Raw string so ``\[`` reaches the regex engine without being an invalid
# string escape sequence.
DEFAULT_SEARCHING_PATTERN = r'.?\[release\].*'
|
||||||
|
|
||||||
|
|
||||||
|
def get(url, timeout=30):
    """Request a given url and return its parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` document parsed with ``html.parser``, or ``None``
        when the request fails or the server answers with a status code
        other than 200. An error message is printed in both failure cases.
    """
    try:
        # Without a timeout a stalled server would block the whole search.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Error with {url} request failed: {err}")
        return None
    if response.status_code != 200:
        print(f"Error with {url} server returned code {response.status_code}")
        return None
    return BeautifulSoup(response.content, features="html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def get_author(email):
    """Retrieve the author of an email.

    Args:
        email: URL of the archived email page.

    Returns:
        The text of the first ``<b>`` element of the page (presumably where
        the archive renders the sender's name -- verify against the archive
        markup), or ``None`` when the page cannot be fetched or contains no
        ``<b>`` element.
    """
    page = get(email)
    if not page:
        return None
    # ``find`` yields None on a page without <b> instead of raising the
    # IndexError that ``find_all('b')[0]`` would.
    first_bold = page.find('b')
    if first_bold is None:
        return None
    return first_bold.string
|
||||||
|
|
||||||
|
|
||||||
|
def is_sent_by(author, authors):
    """Tell whether ``author`` is one of ``authors``, ignoring case.

    An empty ``authors`` filter accepts every author, so True is returned
    without looking at ``author``.
    """
    if authors:
        wanted = author.lower()
        return wanted in [candidate.lower() for candidate in authors]
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def search(month_url, topic, authors):
    """Search for matching emails in one monthly archive page.

    Args:
        month_url: URL of the monthly archive index to scan.
        topic: Regex searched for in each email subject.
        authors: Author names to keep; an empty list keeps everybody.

    Returns:
        A ``(results, status)`` tuple. ``results`` is a list of dicts with
        the keys ``'url'``, ``'subject'`` and ``'author'``. ``status`` is 0
        on success and 1 when the index page or the author of at least one
        matching email could not be retrieved.
    """
    status = 0
    results = []
    soup = get(month_url)
    if not soup:
        return results, 1
    for tag in soup.find_all('a', href=True):
        href = tag['href']
        # Only collect legit urls (ML links are at the 000000.html).
        # Filter first so navigation links are never inspected further.
        if not re.match(r"[0-9]{6}\.html", href):
            continue
        # ``tag.string`` is None when the anchor has no single text child.
        email_subject = (tag.string or '').replace('\n', '')
        if not re.search(topic, email_subject):
            continue
        url = f'{month_url}/{href}'
        author = get_author(url)
        if not author:
            # Report the failure but keep scanning the remaining emails.
            status = 1
            continue
        if not is_sent_by(author, authors):
            continue
        results.append(
            {
                'url': url,
                'subject': email_subject,
                'author': author
            }
        )
    return results, status
|
||||||
|
|
||||||
|
|
||||||
|
def display(results):
    """Print every collected email grouped by month, after a total count.

    ``results`` maps a year to a mapping of month name to a record whose
    ``'emails'`` entry lists dicts with ``'subject'``, ``'author'`` and
    ``'url'`` keys. Months without emails are omitted; when nothing at all
    was collected a single "no results" line is printed instead.
    """
    lines = []
    total = 0
    for year, months in results.items():
        for month, record in months.items():
            emails = record['emails']
            if not emails:
                continue
            lines.append(f'{year}-{month}:')
            entries = [
                f"\t- {mail['subject']} - {mail['author']}\n\t{mail['url']}"
                for mail in emails
            ]
            lines.extend(entries)
            total += len(entries)
    if not total:
        print('No results found...')
        return
    print(f'{total} result(s) have been found')
    print('\n'.join(lines))
|
||||||
|
|
||||||
|
|
||||||
|
def mailing_list_url(string):
    """Validate a mailing list url given on the command line.

    Meant to be used as an ``argparse`` ``type=`` callable.

    Args:
        string: The url as typed by the user.

    Returns:
        The url with one trailing ``/`` removed, if it had one.

    Raises:
        argparse.ArgumentTypeError: If the url cannot be parsed or lacks a
            scheme, a network location or a path.
    """
    error_msg = f'{string} is not a valid url'
    try:
        # Keep the try body minimal: only the parsing itself can raise
        # ValueError (e.g. on a malformed IPv6 host).
        result = urlparse(string)
    except ValueError as err:
        raise argparse.ArgumentTypeError(error_msg) from err
    if not all([result.scheme, result.netloc, result.path]):
        raise argparse.ArgumentTypeError(error_msg)
    return string[:-1] if string.endswith("/") else string
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Main entrypoint.

    Parse the command line, scan the mailing list archive one month at a
    time between the starting and ending dates, then print the matching
    emails.

    Returns:
        int: 0 when every page could be retrieved, otherwise the first
        non-zero status reported by ``search``. The value is meant to be
        used as the process exit code.
    """
    # Backslashes are doubled so the regex examples reach the help output
    # unchanged without relying on invalid string escape sequences.
    epilog = textwrap.dedent("""
    Topic:\n
    Various topic can be used to looking for specific topics, by example
    topic can be set to `.?\\[oslo\\].*` to search all emails related to
    oslo.

    Useful topics:
        - `.?\\[release\\] Release countdown.*` (looking for release countdown)
        - `.?\\[all\\].*` (looking for email related to all)
        - `.?\\[requirements\\].*` (looking for email related to requirements)
        - `.?\\[release\\].*FFE.*` (looking for email related to release FFE)

    Usages:\n
    To looking for emails related to release and filtered between 2 dates:
    ```
    $ {cmd} --starting-date 2020-04-01 --ending-date 2020-4-1
    ```
    To looking for emails related to release and filtered by authors:
    ```
    $ {cmd} --authors "Herve Beraud" "Sean McGinnis"
    ```
    To looking for emails related to release between 2 dates and sent by authors:
    ```
    $ {cmd} --starting-date 2020-04-01 --ending-date 2020-4-1 --authors "Herve Beraud" "Sean McGinnis"
    ```
    To looking for emails related to release FFE since August 2020:
    ```
    $ {cmd} --topic ".?\\[release\\].*FFE.*" --starting-date 2020-8-1
    ```
    To looking for all the release countdown emails sent during victoria (18 May 2020 - 16 October 2020):
    ```
    $ {cmd} --topic ".?\\[release\\] Release countdown.*" --starting-date 2020-5-1
    ```
    """.format(cmd=sys.argv[0]))  # noqa
    parser = argparse.ArgumentParser(
        description='Search emails on the mailing list by topic and authors',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog)
    parser.add_argument(
        '-t',
        '--topic',
        default=DEFAULT_SEARCHING_PATTERN,
        help='Regex pattern to match in emails subject. '
             f'The default pattern is set to `{DEFAULT_SEARCHING_PATTERN}` '
             'to looking for release topics on the ML.'
    )
    parser.add_argument(
        '-a',
        '--authors',
        nargs='+',
        default=[], type=str,
        help='filtering on authors within emails found with the '
             'related topic. Many authors can be given.')
    parser.add_argument(
        '--starting-date',
        type=lambda s: datetime.datetime.strptime(s, '%Y-%m-%d'),
        default=MINIMAL_DATE,
        help="Starting research at the given date. Can't be before "
             f"{MINIMAL_DATE} which is the minimal date allowed for research. "
             "Notice that a research doesn't looking for a day but for an "
             "entire month, in other words day will be ignored."
    )
    parser.add_argument(
        '--ending-date',
        type=lambda s: datetime.datetime.strptime(s, '%Y-%m-%d'),
        default=datetime.datetime.today(),
        help='Ending research at the given date. '
             'Default set to today. '
             "Notice that a research doesn't looking for a day but for an "
             "entire month, in other words day will be ignored."
    )
    parser.add_argument(
        '--mailing-list',
        type=mailing_list_url,
        default=BASE_URL,
        help='Mailing list url to use for search. Should be a valid url. '
             f'Default set to {BASE_URL}.'
    )
    args = parser.parse_args()
    exit_code = 0
    mailing_list = args.mailing_list
    results = {}
    cursor = args.starting_date
    # The lower bound only applies to the default list; other lists may
    # have older archives.
    if mailing_list == BASE_URL and cursor < MINIMAL_DATE:
        print(f"--starting-date can't be inferior to {MINIMAL_DATE} "
              f"with {mailing_list}")
        sys.exit(1)
    ending = args.ending_date
    print('Looking for emails sent on {mailing_list} who match `{topic}` '
          'between {start} and {end} and sent by {authors}\n...'.format(
              mailing_list=mailing_list,
              topic=args.topic,
              start=cursor.strftime('%Y %B'),
              end=ending.strftime('%Y %B'),
              authors=', '.join(args.authors) if args.authors else 'anybody'))
    # Only year and month are significant: snap both bounds to the first
    # day of their month, otherwise a start day later than the end day
    # (e.g. 2020-08-15 .. 2020-09-10) would skip the final month.
    cursor = cursor.replace(day=1)
    ending = ending.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
    while cursor <= ending:
        year = cursor.year
        month = cursor.strftime("%B")
        url = f"{mailing_list}/{year}-{month}"
        print(f"Analyzing {month} {year}: {url}")
        emails, current_exit_code = search(url, args.topic, args.authors)
        results.setdefault(year, {})[month] = {
            "url": url,
            "emails": emails
        }
        cursor += relativedelta(months=1)
        # Keep the first failure status seen across all months.
        exit_code = exit_code or current_exit_code
    display(results)
    return exit_code
|
||||||
|
|
||||||
|
|
||||||
|
# Run the search only when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
Loading…
x
Reference in New Issue
Block a user