.. _cli::p3-collate:


##########
p3-collate
##########


***************************************************
Extract the First N Rows for Each Value of a Column
***************************************************


.. code-block:: perl

     p3-collate.pl N [options]


This script will read through a tab-delimited file on the standard input and output the first N rows for each distinct
value found in the key column. For example, if we have a set of genomes sorted by quality and we ask for a 3-row sample
based on the species column, it will extract the 3 best-quality genomes for each species. The output rows are grouped
by key value, with the key values in sorted order.

Parameters
==========


The positional parameter is the number of rows to extract for each value. It must be at least 1; if it is omitted, it defaults to 1.

The standard input can be overridden using the options in :ref:`cli-input-options`.

Use the options in :ref:`cli-column-options` to select the key column.
=cut

use strict;
use P3DataAPI;
use P3Utils;

# Get the command-line options.
my $opt = P3Utils::script_opts('N', P3Utils::col_options(), P3Utils::ih_options(),
        );
# Get the number of rows to keep for each key value. The default is 1.
# This is validated before any input is read or any output is written, so
# that a bad count fails cleanly instead of leaving a stray header line on
# the standard output.
my ($N) = @ARGV;
$N //= 1;
if ($N < 1) {
    die "Collation specifies no output.";
}
# Open the input file.
my $ih = P3Utils::ih($opt);
# Read the incoming headers.
my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
# Write the header line unless the user has suppressed it.
if (! $opt->nohead) {
    P3Utils::print_cols($outHeaders);
}
# This is the collation hash. It maps each key value to a list of the rows
# retained for that key, in input order.
my %groups;
# Loop through the input.
while (! eof $ih) {
    my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
    for my $couplet (@$couplets) {
        my ($key, $row) = @$couplet;
        # Create the group on first sight of the key, then keep the row only
        # if the group is not yet full. Later rows for a full group are dropped.
        my $group = ($groups{$key} //= []);
        if (scalar @$group < $N) {
            push @$group, $row;
        }
    }
}
# Write the output, one group at a time, in sorted key order.
for my $key (sort keys %groups) {
    my $group = $groups{$key};
    for my $row (@$group) {
        P3Utils::print_cols($row);
    }
}