18 lines
631 B
Pig
18 lines
631 B
Pig
-- Count TODO(<id>) annotations per id and store one record per id,
-- (count, 'https://launchpad.net/~<id>'), most frequent id first.
--
-- Parameters:
--   $INPUT   location of the text to scan; each record is read as one line
--   $OUTPUT  location the (count, url) records are written to via PigStorage
input_lines = LOAD '$INPUT' AS (line:chararray);

-- keep only the lines that contain a TODO(<id>) annotation
todo_lines = FILTER input_lines BY line MATCHES '.*TODO\\s*\\(\\w+\\).*';

-- Pull the id out of the parentheses that directly follow TODO. The pattern is
-- anchored on TODO and uses a reluctant prefix so that other parentheses later
-- on the line (e.g. "TODO(alice): fix foo(bar)") cannot be picked up instead.
-- Only the first TODO(<id>) on a line is counted.
ids = FOREACH todo_lines GENERATE
          REGEX_EXTRACT(line, '.*?TODO\\s*\\((\\w+)\\).*', 1) AS id;

-- create a group for each distinct id
id_groups = GROUP ids BY id;

-- count the entries in each group
atc_count = FOREACH id_groups GENERATE COUNT(ids) AS count, group AS atc;

-- Build the URL before sorting: Pig only guarantees the ORDER BY ordering for
-- the ordered relation itself, not for relations derived from it afterwards.
atc_urls = FOREACH atc_count GENERATE
               count,
               CONCAT('https://launchpad.net/~', atc) AS url;

-- order the records by count (url breaks ties so the output is deterministic)
result = ORDER atc_urls BY count DESC, url ASC;

STORE result INTO '$OUTPUT' USING PigStorage();