#!/usr/bin/perl -w

#
# rssacfin_to_rssacyaml
#
# Copyright (C) 2016 University of Southern California.
# All rights reserved.                                            
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 

=head1 NAME

rssacint_to_rssacyaml - RSSAC intermediate format (post-counted) to final RSSAC YAML

=head1 SYNOPSIS

rssacint_to_rssacyaml < foo.rssacint

=head1 DESCRIPTION

This program reads RSSAC intermediate-format files,
after they have been IP counted, and generates 
final RSSAC YAML output.

It runs in linear time and has constant memory usage.

=head1 OPTIONS

=over

=item B<--directory> DESTROOT or B<-D> DESTROOT

Write output into an RSSAC tree rooted at DESTROOT.
Under DESTROOT will a file in YYYY/MM/PART/rootname-YYYYMMDD-PART.yaml.
Default: none and all output is written to standard output.

=item B<--service> FULL_SERVICE

Specify the full service name.  Typically x.root-servers.net.
What appears in the "service" field inside an RSSAC YAML file.
Default: unknown.example.net.

=item B<--short-service> SHORT_SERVICE

Specify the short service name.  Typically x-root.
What appears in the filename of an RSSAC YAML file.
Default: unknown.

=item B<--period> PERIOD

Specifies the day (PERIOD) under consideration, in ISO-date format (2015-12-27).
Default: inferred from the data, assuming it's about 24 hours (within 10 minutes).

=item B<--version> VERSION

Specifies the RSSAC-002 VERSION to report, either rssac002v2 or rssac002v3 (or just v2 or v3).
Default: rssac002v3.

=item B<-d>

Enable debugging output.

=item B<-v>

Enable verbose output.

=item B<--help>

Show help.

=item B<--man>

Show full manual.


=head1 SAMPLE USAGE

=head2 Input

=head2 Command

=head2 Output

=back

=cut

use strict;
use Pod::Usage;
use Getopt::Long;
use DateTime;
use DateTime::Format::ISO8601;

Getopt::Long::Configure ("bundling");
pod2usage(2) if ($#ARGV >= 0 && $ARGV[0] eq '-?');
#my(@orig_argv) = @ARGV;
my($prog) = $0;
my $debug = undef;
my $verbose = undef;
my $service = 'unknown.example.net';
my $short_service = 'unknown';
my $period = undef;
my $precise_period = undef;
my $out_dir = undef;
my $version = 'v3';
&GetOptions(
	'D|directory=s' => \$out_dir,
	'period=s' => \$period,
	'precise-period!' => \$precise_period,
	'short-service=s' => \$short_service,
	'service=s' => \$service,
	'version=s' => \$version,
 	'help|?' => sub { pod2usage(1); },
	'man' => sub { pod2usage(-verbose => 2); },
	'd|debug+' => \$debug,   
        'v|verbose+' => \$verbose) or pod2usage(2);

if (!defined($version)) {
    die "rssacfin_to_rssacyaml: no --version given\n";
};
$version = lc($version);
$version =~ s/^rssac002//;
if (!($version eq 'v2' || $version eq 'v3')) {
    die "rssacfin_to_rssacyaml: --version must be v2 or v3\n";
};


#die "$prog: must specify a service with --service=x.rootservers.net\n"
#    unless ($service);
# can auto-infer period
#die "$prog: must specify a period with --period=2015-12-27\n"
#    unless ($period);
my($in_schema) = "#fsdb -F t key count";
binmode STDOUT, ":utf8";

my $period_start_dt = undef;
my $period_end_dt = undef;
my(%fin) = ();  # the data!

sub infer_dt_from_fin {
    my($fin_field, $offset) = @_;
    $offset //= 0;
    die "$prog: trying to infer date from nonexistent $fin_field\n"
	unless ($fin{$fin_field});
    return DateTime->from_epoch(epoch => $fin{$fin_field} + $offset);
}

sub infer_period {
    return if (defined($period_start_dt) && defined($period_end_dt));
    if (defined($period)) {
	# just compute dates
	die ("$prog: date must be ISO8601 YYYY-MM-DD format, not $period\n")
	    if ($period !~ /^\d{4}-\d{2}-\d{2}$/);
	$period_start_dt = DateTime::Format::ISO8601->parse_datetime($period . "T00:00:00Z");
	$period_end_dt = DateTime::Format::ISO8601->parse_datetime($period . "T23:59:59Z");
    } elsif ($precise_period) {
	$period_start_dt = infer_dt_from_fin('<ts');
	$period_end_dt = infer_dt_from_fin('>te');
    } else {
	my($first_dt) = infer_dt_from_fin('<ts', 600);
	my($last_dt) = infer_dt_from_fin('>te', -600);
	die "$prog: <ts and >te dates don't appear to be on same day.\n"
	    if ($first_dt->year != $last_dt->year ||
		$first_dt->month != $last_dt->month ||
		$first_dt->day != $last_dt->day);
	$period_start_dt = DateTime->new(
	    year => $first_dt->year,
	    month => $first_dt->month,
	    day => $first_dt->day,
	    hour => 0,
	    minute => 0,
	    second => 0,
	    time_zone => 'UTC');
	$period_end_dt = DateTime->new(
	    year => $first_dt->year,
	    month => $first_dt->month,
	    day => $first_dt->day,
	    hour => 23,
	    minute => 59,
	    second => 59,
	    time_zone => 'UTC');
    };
}

sub mkdir_p {
    my($path) = '';
    $path = shift @_;
    foreach (@_) {
	$path .= "/" . $_;
	if (! -d $path) {
	    mkdir($path) || die "$prog: cannot mkdir $path\n";
	};
    };
    return $path;
}

#
# new_output: open a file and write the yaml header
#
sub new_output {
    my($metric) = @_;

    infer_period();

    my($open_mode, $open_place) = (">:utf8");
    if ($out_dir) {
	my($yyyy) = sprintf("%04d", $period_start_dt->year);
	my($mm) = sprintf("%02d", $period_start_dt->month);
	my($dd) = sprintf("%02d", $period_start_dt->day);
	my($path) = mkdir_p($out_dir, $yyyy, $mm, $metric);
	$open_place = "$path/$short_service-$yyyy$mm$dd-$metric.yaml";
    };
    if (!defined($open_place) || $open_place eq '-') {
	($open_mode, $open_place) = (">&=:utf8", "1");
    };

    my($out); 
    if (!open($out, $open_mode, $open_place)) {
        die "$prog: cannot open $open_place\n";
    }; 
    print $out "---\n";
    if ($version ne 'v2') {
        print $out "version: rssac002$version\n";
    };
    print $out "service: $service\n";
    print $out "start-period: '" . $period_start_dt->iso8601() . "'\n";
    if ($version eq 'v2') {
	print $out "end-period: '" . $period_end_dt->iso8601() . "'\n";
    };
    print $out "metric: $metric\n";
    return $out;
}

#
# output_mapping
# take counts from rssacfin directly to yaml with renaming
#
# for example, +64 to num-sources-ipv4
#
sub output_mapping($@) {
    my($out, $mapping) = @_;
    # mapping: aref of aref pairs of yaml, rssacfin fields
    foreach (@$mapping) {
	my ($yaml_field, $rssacfin_field) = @$_;
	my($v) = $fin{$rssacfin_field};
 	die "$prog: missing expected rssacfin field $rssacfin_field\n" unless (defined($v));
	print $out "$yaml_field: $v\n";
    };
}

#
# output_bins
# take counts from rssacfin with a prefix, group them into bins
# thus: +4u1:911 and +4u1:921, etc. is mpaped to udp-response-size: 912-927
#
sub output_bins($$$$$$) {
    my($out, $yaml_field, $rssacfin_prefix, $min, $max, $bin_size) = @_;
    die "$prog: assertion failed: output_bins has max $max that is not multiple of bin_size $bin_size\n"
	if (int($max / $bin_size) != ($max + 0.0) / $bin_size);
    my $max_bin = int($max / $bin_size);
    my(@bins) = (0) x ($max_bin + 1);
    my($rssacfin_prefix_re) = quotemeta($rssacfin_prefix);
    my($bins_found) = 0;
    foreach my $key (grep(/^$rssacfin_prefix_re/, keys %fin)) {
	my($size_pre_bin) = ($key =~ /^$rssacfin_prefix_re(\d+)$/);
	my($value) = $fin{$key};
	my $b = int($size_pre_bin /$bin_size);
	$b = $max_bin if ($b > $max_bin);
	$bins[$b] += $value;
	$bins_found++;
    };
    return if ($bins_found == 0);
    print $out "$yaml_field:\n";
    foreach (0..$#bins) {
	next if ($bins[$_] == 0);
        my($start) = $_ * $bin_size;
	my($end) = $start + $bin_size - 1;
	next if ($end < $min);
	my($display_start) = $start < $min ? $min : $start;
	my($display_end) = ($_ == $#bins) ? '' : $end;
	print $out "  ${display_start}-${display_end}: $bins[$_]\n";
    };
}

#
# output_tails
# find all keys that match the prefix and output them
# thus: +51:0 with prefix +51: is mapped to 0 for an rcode
#
sub output_tails($$$;$) {
    my($out, $yaml_field, $rssacfin_prefix, $default_tail) = @_; 
    print $out "$yaml_field:\n"; 
    my($rssacfin_prefix_re) = quotemeta($rssacfin_prefix);
    foreach my $key (sort grep(/^$rssacfin_prefix_re/, keys %fin)) {
	my($tail) = ($key =~ /^$rssacfin_prefix_re(.*)$/);
	if (!defined($tail) || $tail eq '') {
	    if (defined($default_tail)) {
		$tail = $default_tail;
	    } else {
		die "rssacfin_to_rssacyaml: missing tail on $key\n";
	    };
	};
	my($value) = $fin{$key};
	print $out "  $tail: $value\n"; 
    }; 
 }


sub process_file($) {
    my($file) = @_;
    my($open_mode, $open_place) = ("<:utf8", $file);
    if ($file eq '-') {
	($open_mode, $open_place) = ("<&=:utf8", 0);
    };
    my $in;
    if (!open($in, $open_mode, $open_place)) {
	die "$prog: cannot open $open_place\n";
    };
    while (<$in>) {
	chomp;
        if (/^#/) {
            if (/^#fsdb/) {
                die "unexpected schema: $_\n"
                    if ($_ !~ /^$in_schema/);
		next;
            };
            next;
        };
	my($key, $value) = split("\t");
	die ("$prog: cannot parse line $_\n") if (!defined($key) || !defined($value));
	die ("$prog: duplicate key $key in line $_\n") if (defined($fin{$key}));
	die ("$prog: input has rssacint data (like +64:192.0.2.1). reduce it with rssacint_reducer --count-ips\n")
	    if ($key =~ /^\+6[46a]:/);
	$fin{$key} = $value;
    };
    #
    # 2.1 load time
    # 2.2 zone size
    # : cannot be determined from rssacfin data
    #
    # 2.3 traffic-volume
    #
    my($out);
    $out = new_output('traffic-volume');
    output_mapping($out, [ [ 'dns-udp-queries-received-ipv4', '+3u04' ],
			   [ 'dns-udp-queries-received-ipv6', '+3u06' ],
			   [ 'dns-tcp-queries-received-ipv4', '+3t04' ],
			   [ 'dns-tcp-queries-received-ipv6', '+3t06' ],
			   [ 'dns-udp-responses-sent-ipv4', '+3u14' ],
			   [ 'dns-udp-responses-sent-ipv6', '+3u16' ],
			   [ 'dns-tcp-responses-sent-ipv4', '+3t14' ],
			   [ 'dns-tcp-responses-sent-ipv6', '+3t16' ] ]);
    close ($out);

    #
    # 2.4 traffic-sizes
    #
    $out = new_output('traffic-sizes');
    foreach ([ 'udp-request-sizes' , '+4u0:', 288 ],
	     [ 'udp-response-sizes' , '+4u1:', 4096 ],
	     [ 'tcp-request-sizes' , '+4t0:', 288 ],
	     [ 'tcp-response-sizes' , '+4t1:', 4096 ],
	     [ 'any-request-sizes' , '+40:', 288 ],
	     [ 'any-response-sizes' , '+41:', 4096 ]) {
        my($yaml_field, $rssacfin_prefix, $max) = @$_;
        output_bins($out, $yaml_field, $rssacfin_prefix, 16, $max, 16);
    }
    close($out);

    #
    # 2.5 rcode-volume
    #
    $out = new_output('rcode-volume');
    my($queries) = $fin{'+3u04'} + $fin{'+3u06'} + $fin{'+3t04'} + $fin{'+3t06'};
    if (defined($fin{'+50'})) {
        # Backwards compatibility: early versions of message_to_rssacint
	# summed ALL rcodes, not just response rcodes.  We correct for that
	# by assuming all queries were rcode 0 and adjusting down by the number of requests.
	# This bug was fixed 2016-05-25.
	die "rssacfin_to_rssacyaml: cannot process rssacint with mixed old and new rcodes (are you B on 2016-05-27?).\n"
	    if (defined($fin{'+51:0'}));
	$fin{'+50'} -= $queries;
        output_tails($out, 'rcodes', '+5');
    } else {
	# New world order: the input splits out rcodes for queries and replies.
        output_tails($out, 'rcodes', '+51:');
    };
    close ($out);

    #
    # 2.6 sources
    #
    $out = new_output('unique-sources');
    output_mapping($out, [ [ 'num-sources-ipv4', '+64' ],
    		           [ 'num-sources-ipv6', '+66' ],
    		           [ 'num-sources-ipv6-aggregate', '+6a' ] ]);
    close($out);

    #
    # bonus stats
    #
    $out = new_output('extra');
    if (defined($fin{'=rfileseqno'})) {
	# Old-style was missing a site suffix (sigh, only one site).
        output_tails($out, 'fileseqno', '=rfileseqno', 'site');
    } else {
        output_tails($out, 'fileseqno', '=rfileseqno:');
    };
    output_mapping($out, [ [ 'start-time', '<ts' ],
    		           [ 'end-time', '>te' ] ]);
    my $duration = $fin{'>te'} - $fin{'<ts'};
    printf $out "duration\t%.3f\n", $duration;
    my $queries_per_second = ($queries * 1.0) / $duration;
    printf $out "queries-per-second\t%.2f\n", $queries_per_second;
    close($out);
};
    
push (@ARGV, "-") if ($#ARGV == -1);
foreach (@ARGV) {
    process_file($_);
};

exit 0;

=head1 SEE ALSO

L<dnsanon(1)>,
L<message_to_rssacint(1)>,
L<rssacint_reduce(1)>,
L<rssacfin_to_rssacyaml(1)>


=head1 AUTHOR and COPYRIGHT

This program was written by John Heidemann.

Copyright (C) 2016 University of Southern California.

This program is distributed under terms of the GNU general
public license, version 2.  See the file COPYING
with the distribution for details.

=cut


