#! /usr/bin/env perl
# Copyright © 2025 Étienne Mollier <emollier@emlwks999.eu>
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
use strict;
use warnings;
use POSIX();

# sum adds up every numerical value received in argument and returns
# the resulting total as a scalar.  An empty list sums up to 0.
sub sum {
	my $acc = 0;
	for my $term (@_) {
		$acc += $term;
	}
	return $acc;
}

# occurrences receives a scalar followed by a list, and returns how many
# elements of the list are numerically equal to the scalar.
sub occurrences {
	my ($needle, @haystack) = @_;
	# grep evaluated in scalar context yields the number of matches.
	my $hits = grep { $needle == $_ } @haystack;
	return $hits;
}

# get_blksize takes a string storing a file or directory name in input,
# and returns the preferred I/O block size that stat reports for it.
#
# When the platform does not report a usable block size (empty or zero
# field), 4096 bytes is returned instead, so that callers can safely
# divide by the result.  The function dies when the path cannot be
# stat'ed.
sub get_blksize {
	my $filename = shift;
	# stat accepts a path directly; going through open first is not
	# needed, and opening a directory for reading is not portable.
	my @stat = stat $filename
		or die "error: opening $filename to get block size: $!";
	my $blksize = $stat[11];
	return 4096 unless $blksize;
	return $blksize;
}

# get_carnet_files takes a directory in argument and returns the list of
# entries whose name has the exact shape NNNN-NN.txt (four digits, a
# dash, two digits, then the .txt extension), which is the naming of
# the carnet week files.  The names are returned without the directory
# part, in the order readdir yields them.
#
# A warning is emitted when no such file is found, and the function dies
# when the directory cannot be opened.
sub get_carnet_files {
	my $directory = shift;
	opendir my $dh, $directory
		or die "error: opening $directory to get carnet files: $!\n";
	# \A and \z pin the whole name, and the dot is escaped so that
	# names such as "2025-03Xtxt" are not mistaken for week files.
	my @files = grep { /\A[0-9]{4}-[0-9]{2}\.txt\z/ } readdir $dh;
	closedir $dh;
	warn "warning: no carnet files found in $directory\n" unless @files;
	return @files;
}

# count_entries takes one file handle of a carnet week in argument,
# reads it up to the end, and returns the number of lines that start
# with a time stamp of the form HH:MM followed by an optional space and
# a colon; each such line is counted as one entry.
#
# The lines are read into a lexical variable, so the caller's $_ is left
# untouched; this matters when the function is called from within a
# foreach loop that aliases $_ to the elements of a list.
sub count_entries {
	my $filehandle = shift;
	my $entries = 0;
	while (my $line = readline $filehandle) {
		$entries += 1 if $line =~ /^[0-2][0-9]:[0-5][0-9] ?:/;
	}
	return $entries;
}

# get_carnet_dir implements the logic of environment variables or XDG
# directory in order to locate the directory in which carnet weeks are
# supposed to be stored.  In order of precedence:
#
#   1. the CARNET_DIR environment variable, when set to a true value;
#   2. /var/log/carnet when the USER environment variable is "root";
#   3. the "carnet" subdirectory of what xdg-user-dir DOCUMENTS prints;
#   4. $HOME/Documents/carnet when xdg-user-dir could not be run, exited
#      with a non-zero status, or printed nothing.
#
# The directory name is returned as a string; its existence is not
# verified here.
sub get_carnet_dir {
	return $ENV{CARNET_DIR} if $ENV{CARNET_DIR};
	# USER may be unset (cron jobs, minimal containers).
	return "/var/log/carnet" if ($ENV{USER} // "") eq "root";

	my $documents = `xdg-user-dir DOCUMENTS 2>/dev/null`;
	# $? is -1 only when the command could not be launched at all.
	# A missing xdg-user-dir is reported by the shell with a non-zero
	# exit status instead, hence the test against any failure.
	my $failed = $? != 0;
	$documents = "" unless defined $documents;
	chomp $documents;
	if ($failed or $documents eq "") {
		return ($ENV{HOME} // "") . "/Documents/carnet";
	}
	return $documents . "/carnet";
}

# ruler takes a scalar block size in bytes followed by a list of sizes
# in bytes, and returns a string representing a ruler with one unit per
# character, long enough to capture the maximum value of the list.  The
# ruler is targeted at measuring alignment to the block size, at one
# character per kibibyte: a number is printed right-aligned at every
# multiple of the block size expressed in kibibytes, and one extra mark
# is appended when the maximum does not fall on such a multiple.
#
# The values do not need to be sorted: only their maximum is used, and
# it is found with a single pass.  An empty list yields the bare "0"
# origin, and block sizes below one kibibyte, zero included, are handled
# as one mark per character.
sub ruler {
	my ($blksize, @values) = @_;
	my $largest = @values ? $values[0] : 0;
	for my $value (@values) {
		$largest = $value if $value > $largest;
	}
	my $max = POSIX::ceil($largest / 1024);
	my $mark = POSIX::ceil($blksize / 1024);
	# A step of zero would never let the loop below terminate, and
	# would make the modulus illegal.
	$mark = 1 if $mark < 1;
	my $rule = "0";
	my $i;
	for ($i = $mark; $i <= $max; $i += $mark) {
		$rule .= sprintf "%${mark}s", $i;
	}
	# $i now holds the first mark beyond the maximum; print it when
	# the longest bar ends in the middle of a block.
	if ($max % $mark != 0) {
		$rule .= sprintf "%${mark}s", $i;
	}
	return $rule;
}

# ascii_histogram takes a scalar block size in bytes and a hash mapping
# file names with their size in bytes, and it returns a string holding
# one line per file, sorted by file name.
#
# Each line is made of the file name deprived of its last four
# characters (the .txt extension of week files), a "|" separator, and a
# bar of one character per started kibibyte.  Within the bar, every
# position that is a multiple of the block size expressed in kibibytes
# is drawn as "|", and the other ones as "="; with 4kiB blocks, that is
# a mark every four units.
sub ascii_histogram {
	my ($blksize, %filesizes) = @_;
	my $kib_per_block = POSIX::ceil($blksize / 1024);
	my @rows;
	for my $name (sort keys %filesizes) {
		my $kib = POSIX::ceil($filesizes{$name} / 1024);
		my $bar = join "",
			map { $_ % $kib_per_block ? "=" : "|" } 1 .. $kib;
		# Trimming the .txt extension of the week file.
		my $label = substr $name, 0, -4;
		push @rows, "$label|$bar\n";
	}
	return join "", @rows;
}

# boxplot takes a list of ordered sizes in bytes in input and returns a
# string representing the boxplot in ascii art, with one character
# amounting for one kibibyte.
#
# Legend of the markers: "#" is the median, "|" stands for the 10th
# percentile, first quartile, third quartile and 90th percentile, "."
# for the minimum and maximum, "o" for the 1st and 99th percentiles, and
# "O" for the 5th and 95th percentiles.  Between markers, "=" fills the
# interquartile range, "-" the rest of the 10th to 90th percentile
# range, and spaces everything else.
#
# Warning: for performance reasons, values are expected to be sorted
# prior to invocation of the boxplot function.
sub boxplot {
	# Normalize raw byte figures to kibibytes.
	my @kib = map { POSIX::ceil($_ / 1024) } @_;
	my $last = $#kib;
	# Value sitting at the given percentile rank of the series.
	my $at = sub { $kib[POSIX::ceil($_[0] * $last / 100)] };
	my ($min, $p01, $p05, $p10, $q1, $med, $q3, $p90, $p95, $p99, $max)
		= map { $at->($_) } (0, 1, 5, 10, 25, 50, 75, 90, 95, 99, 100);
	# Both tables are scanned in order and the first match wins: be
	# careful, their ordering decides which glyph is rendered when
	# several figures share the same column.
	my @markers = (
		[$med, "#"], [$p10, "|"], [$q1,  "|"], [$q3,  "|"],
		[$p90, "|"], [$max, "."], [$min, "."], [$p99, "o"],
		[$p01, "o"], [$p95, "O"], [$p05, "O"],
	);
	my @fillers = (
		[$p10, " "], [$q1,  "-"], [$med, "="],
		[$q3,  "="], [$p90, "-"], [$max, " "],
	);
	# This is the zeroeth character.
	my $output = "|";
	# Effectively drawing the boxplot horizontally.
	COLUMN: for (my $col = 1; $col <= $max; $col++) {
		for my $marker (@markers) {
			next unless $col == $marker->[0];
			$output .= $marker->[1];
			next COLUMN;
		}
		for my $filler (@fillers) {
			next unless $col < $filler->[0];
			$output .= $filler->[1];
			next COLUMN;
		}
	}
	return $output;
}

# decileplot takes a list of ordered sizes in bytes in input and returns
# a string similar to the boxplot, in that it marks each decile on the
# same scale of one character per kibibyte.
#
# Each column shows how many of the eleven deciles (minimum and maximum
# included) fall on it: a space for none, "-" for one, "+" for two, "="
# for three, and "#" for four or more.  An empty list yields the bare
# "|" origin.
#
# Compared to the boxplot, which is optimized for visualising gaussian
# distributions in one dimension, the decileplot will facilitate viewing
# distinct patterns of activity.
sub decileplot {
	# Normalize raw byte figures to kibibytes.
	my @sizes = map { POSIX::ceil($_ / 1024) } @_;
	# Nothing to plot: only the zeroeth character remains.
	return "|" unless @sizes;
	my @deciles = map { $sizes[POSIX::ceil($_ * $#sizes / 10)] } 0 .. 10;
	# Glyphs indexed by the number of deciles sharing one column.
	my @glyphs = (" ", "-", "+", "=");
	my $output = "|";
	for (my $i = 1; $i <= $deciles[10]; $i += 1) {
		# The deciles are scanned only once per column.
		my $hits = grep { $_ == $i } @deciles;
		$output .= $hits < @glyphs ? $glyphs[$hits] : "#";
	}
	return $output;
}

# percentileplot takes a list of ordered sizes in bytes in input and
# returns a string similar to the decileplot, but with each percentile,
# on the same scale of one character per kibibyte.
#
# Each column shows how many of the 101 percentiles (minimum and maximum
# included) fall on it: a space for none, "-" for one, "+" for two, "="
# for three, and "#" for four or more.  An empty list yields the bare
# "|" origin.
#
# Combined side by side to the decileplot, it paints an even better
# density view of the activity in the carnet.
sub percentileplot {
	# Normalize raw byte figures to kibibytes.
	my @sizes = map { POSIX::ceil($_ / 1024) } @_;
	# Nothing to plot: only the zeroeth character remains.
	return "|" unless @sizes;
	my @percentiles
		= map { $sizes[POSIX::ceil($_ * $#sizes / 100)] } 0 .. 100;
	# Glyphs indexed by the number of percentiles sharing one column.
	my @glyphs = (" ", "-", "+", "=");
	my $output = "|";
	for (my $i = 1; $i <= $percentiles[100]; $i += 1) {
		# The percentiles are scanned only once per column.
		my $hits = grep { $_ == $i } @percentiles;
		$output .= $hits < @glyphs ? $glyphs[$hits] : "#";
	}
	return $output;
}

# full_histogram takes a scalar block size and a map of files and their
# sizes.  It returns a string containing the complete panel: a ruler,
# the per-file histogram, the ruler again, the boxplot, decileplot and
# percentileplot of the sizes, and a closing ruler.  Every line is
# prefixed by a seven characters wide label column.
sub full_histogram {
	my ($blksize, %filesizes) = @_;
	# The plots expect their values in increasing numerical order.
	my @ordered = sort { $a <=> $b } values %filesizes;
	my $scale = "       " . ruler($blksize, @ordered) . "\n";
	my $bars = ascii_histogram($blksize, %filesizes);
	my $dist = "Dist.: " . boxplot(@ordered) . "\n";
	my $deci = "Deci.: " . decileplot(@ordered) . "\n";
	my $perc = "Perc.: " . percentileplot(@ordered) . "\n";
	return join "", $scale, $bars, $scale, $dist, $deci, $perc, $scale;
}

# format_stats takes scalars in argument representing the total apparent
# size of the carnet in bytes, the total raw block size occupied by the
# carnet in bytes, the number of weeks for which carnet entries have
# been filled, and the number of entries that have been written.  It
# returns a three lines string storing the rendered figures with their
# descriptions: totals and density, means per week, then counts.  All
# these figures are expected to have been processed previously, this
# function is only one of rendering.
#
# A carnet made only of empty files occupies zero raw bytes; the
# density is then reported as 0% instead of dividing by zero, and the
# same goes for the means per week when the number of weeks is zero.
sub format_stats {
	my ($totalsize, $totalblocks, $weeks, $entries) = @_;
	my $density = $totalblocks ? 100 * $totalsize / $totalblocks : 0;
	my $mean_size = $weeks ? $totalsize / $weeks / 1024 : 0;
	my $mean_blocks = $weeks ? $totalblocks / $weeks / 1024 : 0;
	my $output = "";
	$output .= sprintf "Apparent size: %.1fkiB  ", $totalsize / 1024;
	$output .= sprintf "raw: %.0fkiB  ", $totalblocks / 1024;
	$output .= sprintf "density: %.2f%%\n", $density;
	$output .= sprintf "Mean apparent size per week: %.1fkiB  ",
		$mean_size;
	$output .= sprintf "raw: %.1fkiB\n", $mean_blocks;
	$output .= sprintf "Number of entries: %d  ", $entries;
	$output .= sprintf "of weeks: %d\n", $weeks;
	return $output;
}

# truncate_text_width takes a text stored in the first argument and
# limits it to the width, in characters, specified in second argument.
# This is intended to truncate output on short terminals and works by
# truncating the end of each line when the text is made of several long
# lines.
#
# Every line of the result is terminated by a newline, including the
# last one, and trailing empty lines of the input are dropped, as split
# discards them.  A width which is not a non-negative integer disables
# the truncation instead of being injected into a pattern.
sub truncate_text_width {
	my ($input, $width) = @_;
	my $limit;
	if (defined $width and $width =~ /\A[0-9]+\z/) {
		$limit = $width;
	}
	my $output = "";
	for my $line (split /\n/, $input) {
		# substr has no upper bound on the width, contrary to a
		# regular expression quantifier.
		my $kept = defined $limit ? substr($line, 0, $limit) : $line;
		$output .= $kept."\n";
	}
	return $output;
}

# show_activity is the entry point of the carnet --activity option.  It
# takes a string storing the carnet's directory name in input and prints
# out the entire statistics panel on standard output: histogram with
# rulers, boxplot, decile and percentile plots, then the summary.
#
# When the directory is not given, it is located with get_carnet_dir
# after emitting a warning.  The program exits with status 1 when the
# directory does not exist or holds no week file, and dies when the
# directory or one of its week files cannot be accessed.  As a side
# effect, the current working directory is changed to the carnet
# directory.
sub show_activity {
	my $carnet_dir = shift;
	# While keeping bourne shell and perl implementations separate,
	# a full move to perl, for the exercise, is beginning to be
	# tempting at this point in time.  Hence the duplicated handling
	# of the carnet directory detection, when it is not passed
	# automatically by the Bourne Shell script upstream via
	# arguments passing.  Warning is necessary though, as in such
	# scenario, the dedicated perl script for activity would be a
	# temporary measure.
	unless (defined $carnet_dir) {
		warn <<~END;
		warning: $0 is intended to be invoked from:
		         \$ carnet --activity
		         Please refrain from invoking it by hand,
		         as it may disappear in the future
		END
		$carnet_dir = get_carnet_dir();
	}
	unless (-d $carnet_dir) {
		warn <<~END;
		error: $carnet_dir does not exist yet.
		       Please fill your carnet first.
		END
		exit 1;
	}
	my $blksize = get_blksize($carnet_dir);
	# The block size is used as a divisor below; fall back on a
	# conventional value should the platform report none.
	$blksize = 4096 unless $blksize;
	my @files = get_carnet_files($carnet_dir);

	unless (@files) {
		warn <<~END;
		error: $carnet_dir does not contain weekly entries yet.
		       Please fill your carnet first.
		END
		exit 1;
	}

	my %filesizes;
	my %fileblocks;
	my $entries = 0;
	chdir $carnet_dir
		or die "error: changing directory to $carnet_dir: $!";

	# This section interleaves obtention of various data from all
	# files in one go, and is thus a problem to break into smaller
	# functions without an impact on the overall performances.
	#
	# A named loop variable is used rather than $_, so that reading
	# lines in count_entries cannot tamper with the list of files.
	for my $file (@files) {
		open(my $filehandle, "<", $file)
			or die "error: opening ${file}: $!";
		my @stat = stat $filehandle
			or die "error: stating ${file}: $!";
		$entries += count_entries($filehandle);
		close $filehandle;
		my $diskuse = $stat[7];
		$filesizes{$file} = $diskuse;
		# Apparent size rounded up to a whole number of blocks.
		$fileblocks{$file} = POSIX::ceil($diskuse / $blksize) * $blksize;
	}

	my $totalsize = sum(values %filesizes);
	my $totalblocks = sum(values %fileblocks);
	# One week file per key.
	my $weeks = scalar keys %filesizes;
	my $histogram = full_histogram($blksize, %filesizes);
	my $stats = format_stats($totalsize, $totalblocks, $weeks, $entries);
	if ( -t 1 ) {
		my @termsize = split / /, qx(stty size);
		# The default value of 400 is to fit within some mail
		# servers default line length limits.
		my $cols = 400;
		# Only trust a strictly positive number of columns: a width
		# of 0 would otherwise blank the whole output.
		if ( defined $termsize[1]
				and $termsize[1] =~ /\A([0-9]+)\s*\z/
				and $1 > 0 ) {
			$cols = $1;
		}
		$histogram = truncate_text_width($histogram, $cols);
		$stats = truncate_text_width($stats, $cols);
	}
	print $histogram;
	print $stats;
}

# Script entry point: the first command line argument, when present, is
# handed over to show_activity as the carnet directory.
show_activity($ARGV[0]);
