sahara-tests/sahara_tests/scenario/defaults/edp-examples/edp-pig/top-todoers/example.pig

18 lines
631 B
Pig

input_lines = LOAD '$INPUT' AS (line:chararray);
-- filter out any lines that are not with TODO
todo_lines = FILTER input_lines BY line MATCHES '.*TODO\\s*\\(\\w+\\)+.*';
ids = FOREACH todo_lines GENERATE FLATTEN(REGEX_EXTRACT($0, '(.*)\\((.*)\\)(.*)', 2));
-- create a group for each word
id_groups = GROUP ids BY $0;
-- count the entries in each group
atc_count = FOREACH id_groups GENERATE COUNT(ids) AS count, group AS atc;
-- order the records by count
result = ORDER atc_count BY count DESC;
result = FOREACH result GENERATE count, CONCAT('https://launchpad.net/~', atc);
STORE result INTO '$OUTPUT' USING PigStorage();