#! /usr/bin/perl
use warnings;
use strict;
use integer;
use FindBin;
use lib $FindBin::RealBin;
use Def;

# %ignsct: omit ("ignore") some warnings for these man sections, and do
#          not output man pages belonging to them
our %ignsct           = map {$_=>1} qw( 2 3 );
our $indent           = ' ' x 5;
our $file_debram      = $Def::debram_txt;
our $test             = 0; # set to 1 to run on test data only

our $usage = <<END;
usage: $0 {-t debram.txt} [-jsvh] [Contents file]
    -t debram.txt
           use alternate data file, which need not necessarily be
           a full debram.txt (this option must appear first)
    -j ignore the Contents file's header
    -s append a "Stray Commands" pseudobranch to the output
    -v verbose: report script progress, and files/links/whatises/etc
           the script cannot handle, to stderr (normally recommended)
    -h print this usage message
END

# This relatively unimportant helper script attempts to marshal the
# one-line whatis(1) descriptions by binary package and ramification.
#
# The script's author admits that he does not much like the script.  The
# script remains buggy, yet its relative unimportance discourages the
# author from spending too much more time improving it.  To fix the
# script right would require an understanding of mandb(8) internals
# which the author lacks sufficient time and incentive to gain.  At
# least the script *seems* to work approximately well enough for the
# purpose intended.  (The author regrets writing this script in Perl.
# Wrong language, in retrospect.  Too late now...)
#
# If you are developing Debram and updating cmdsel.txt, you are likely
# to find the script's output interesting nevertheless.  Try it with
# the -sv options, capturing stdout in one file and stderr in another,
# and see what you get.
#
# TESTING THE SCRIPT
#
# Because the Contents file is large and there are many manpage files,
# the script takes some time to run (on the order of one minute on the
# Intel 845G / Pentium 4 machine [*] the author happens to be using at
# the moment).  To avoid such delays when experimenting with the script,
# you can set $test above to 1, then set up a "test" subdirectory in
# your present working directory with the relevant parts of a Debian
# filesystem: test/usr/share/man/ and the like.  In "test"'s top level,
# you must include the special files
#
#     * "whatis", with mock "whatis -w '*'" output;
#     * "manpath", with mock "manpath" output; and
#     * "dpkggs", with mock "dpkg --get-selections" output.
#
# Probably also,
#
#     * "Contents", a mock Contents file (need not be gzipped); and
#     * "debram.txt", a mock debram.txt (of course).
#
# In the latter case you would run the script with the -t option,
# specifying both the mock debram.txt and the mock Contents on the
# command line.  Naturally you would give the -v option, too.
#
# An example of mock manpath output:
#
#     test/usr/local/man:test/usr/share/man:test/usr/X11R6/man
#
# An example line from a mock Contents file:
#
#     usr/share/man/man1/foo.1.gz admin/mypkg
#
# [*] The funny thing about mentioning the machine you happen to be
# using at the moment is: you know that someone will read the mention
# years later and think it quaint.  So, if you are running an
# Intel 7000G / Pentium 7 at a million GHz by the time you read these
# words on your high-definition 3-D immersive hyperspatial display, why,
# beam the author an e-mail back in time to 2006 and tell him what it's
# like.
#
#

# Read command-line arguments and options.
while ( @ARGV >= 2 && ( $ARGV[0] eq '-t' ) ) {
  my( $opt, $file ) = splice @ARGV, 0, 2;
  if    ( $opt eq '-t' ) { $file_debram = $file }
}
my @opt;
my @arg;
push @{ /^-\S/ ? \@opt : \@arg }, $_ for @ARGV;
my %opt = map {
  my $o = $_;
  map { substr( $o, $_, 1 ) => 1 } 1 .. length($o)-1
} @opt;
if ( @arg < 1 || $opt{'?'} || $opt{h} ) {
  print $usage;
  exit 0;
}
my $f_cont       = shift @arg;
!defined($f_cont) || @arg and die $usage;

my $pwd     = `pwd`; chomp $pwd;
my $pat_pwd = quotemeta $pwd;

sub readlink1 ($) {
  local $_ = shift;
  my $link = `readlink -f $_`; chomp $link;
  $link =~ s/^$pat_pwd\///
    or die
    "$0: test link is from outside the present working directory\n"
    if $test;
  return $link;
}

sub vwarn { warn @_ if $opt{v} }

our $bar         = '-' x $Def::width_fmt . "\n";
my  $w_maint_pri = $Def::w_maint + $Def::w_pri;

# ----------------------------------------------------------------------

# Read in.

my %whatis ; # desc       ->> whatis
my %whatis2; # sect & cmd ->> whatis
vwarn "${bar}reading whatis...\n";
{
  for ( $test ? `cat test/whatis` : `whatis -w '*'` ) {
    my( $cmd, $sect, $sext, $desc );
    {
      my $pat;
      {
        my $p1  = qr/(\S+)\s+\((\d)([^()]*)\)/;
        my $p2  = qr/\s(?:.*?\s)??-\s+/       ;
        my $p3  = qr/((?:\S(?:.*\S)?)?)/      ;
        $pat    = qr/^${p1}${p2}${p3}\s*$/o;
      }
      ( $cmd, $sect, $sext, $desc ) = /$pat/o;
    }
    unless ( defined $cmd ) {
      /\s\((\d)([^()]*)\)\s/ && $ignsct{$1}
        or vwarn( "cannot parse whatis\n$_\n" );
      next;
    }
    my %whatis1 = (
      fultxt => $_   ,
      cmd    => $cmd ,
      sect   => $sect,
      sext   => $sext,
      desc   => $desc,
    );
    push @{ $whatis {$desc}       }, \%whatis1;
    push @{ $whatis2{$sect}{$cmd} }, \%whatis1;
  }
  for my $desc ( keys %whatis ) {
    my @whatis = sort {
      $a->{sect} <=> $b->{sect} or
      $a->{cmd } cmp $b->{cmd } or
      $a->{sext} cmp $b->{sext} or
      $a->{desc} cmp $b->{desc}
    } @{ $whatis{$desc} };
    $whatis{$desc} = \@whatis;
  }
}

vwarn "${bar}reading the manpath...\n";
my @manpath = grep { chomp; !/\blocal\b/ }
  split ':', ( $test ? `cat test/manpath` : `manpath` );
my $pat_manpath;
{
  my $p0 = join '|', map { quotemeta } @manpath;
  $pat_manpath = qr/$p0/;
}

my %manpg; # sect & cmd -> actual manpage
vwarn "${bar}reading actual manpage filenames...\n";
{
  my @manpg0  = `find @manpath -type f -mindepth 2 -maxdepth 2`;
  my @manpgl0 = `find @manpath -type l -mindepth 2 -maxdepth 2`;
  my %kill;
  for my $ctl (
    { list => \@manpg0 , islink => ''  },
    { list => \@manpgl0, islink => '1' },
  ) {
    for ( @{ $ctl->{list} } ) {
      chomp;
      my( $cmd, $sect, $sext );
      {
        local $_ = $_;
        s/\.gz$//;
        ( $cmd, $sect, $sext ) =
          /^(?:\S*\/)?([^\/\s]+)\.(\d)([^\/.\s]*?)$/
          or vwarn(
            $ctl->{islink}
            ? "cannot parse actual manpage linkname $_\n"
            : "cannot parse actual regular manpage filename $_\n"
          ), next;
      }
      my $level = 20 * !length($sext) + 10 * ( !$ctl->{islink} + 1 );
      my %manpg1 = (
        path   => $_    ,
        cmd    => $cmd  ,
        sect   => $sect ,
        sext   => $sext ,
        level  => $level,
      );
      if ( $ctl->{islink} ) {
        my $realf = readlink1 $_;
        unless ( $realf =~ /\S/ ) {
          $ignsct{$sect}
            or vwarn "cannot follow the real symlink "
            . "$_ to its target\n";
          next;
        }
        $manpg1{realf} = $realf;
      }
      my $dont_record = '';
      if ( my $existing = $manpg{$sect}{$cmd} ) {
        if    ( $manpg1{level} > $existing->{level} )
          { delete $kill{$sect}{$cmd} }
        elsif ( $manpg1{level} < $existing->{level} )
          { $dont_record       = 1    }
        else
          { $kill{$sect}{$cmd} = 1    }
        $ignsct{$sect} or vwarn "two $cmd ($sect) manpages exist\n";
      }
      $manpg{$sect}{$cmd} = \%manpg1 unless $dont_record;
    }
  }
  for my $sect ( keys %kill ) {
    delete $manpg{$sect}{$_} for keys %{ $kill{$sect} };
  }
}

my %inv_mp; # path (file/link, not realfile) -> sect & cmd
vwarn "${bar}inverting the manpage hash...\n";
for my $sect ( keys %manpg ) {
  for my $cmd ( keys %{ $manpg{$sect} } ) {
    my $file = $manpg{$sect}{$cmd}{path};
    exists $inv_mp{$file}
      and die
      "$0: impossible: two actual manpage files with the same path\n";
    $inv_mp{$file} = {
      sect => $sect,
      cmd  => $cmd ,
    };
  }
}

my %dpkggs; # pkg -> (dummy 1)
vwarn "${bar}reading dpkg --get-selections...\n";
{
  for ( $test ? `cat test/dpkggs` : `dpkg --get-selections` ) {
    my( $pkg ) = /^(\S+)/;
    if ( defined $pkg ) {
      $dpkggs{$pkg} = 1;
    }
    else {
      vwarn "do not understand from dpkg --get-selections\n$_\n";
    }
  }
}

my %contp; # file (cont) ->> pkg
my %contq; # link (cont) ->> pkg
my %contf; # pkg ->> file (cont)
my %contl; # pkg ->> link (cont)
vwarn "${bar}reading the Contents file...\n";
{
  my %realf; # link -> file
  my $isbody = !$opt{j};
  CONTENTS: for (
    $f_cont =~ /\.gz$/ ? `zcat $f_cont` : `cat $f_cont`
  ) {
    if ( !$isbody ) {
      $isbody = '1' if /^FILE\s+LOCATION\s*$/;
    }
    else {
      if ( $test ) { s/^/test\// }
      else         { s/^/\//     }
      /^$pat_manpath\/man\d\//o or next CONTENTS;
      my( $file, $loc ) = /^(\S(?:.*?\S)??)\s+(\S+?)\s*$/
        or vwarn( "cannot parse Contents entry\n$_\n" ),
        next CONTENTS;
      my @loc = grep { $_ eq '->' or $dpkggs{$_} } map {
        my $ret;
        if    ( /^.*\/([^\/\s]+)$/ ) { $ret = $1 }
        elsif ( $_ eq '->'         ) { $ret = $_ }
        elsif ( /[,\s]/ || !/\S/ ) {
          vwarn "cannot parse Contents location $_\n";
          next CONTENTS;
        }
        else                         { $ret = $_ }
        $ret;
      } split ',', $loc;
      for my $loc ( @loc ) {
        if ( $loc eq '->' ) {
          if ( -l $file ) {
            my $realf = readlink1 $file;
            if ( $realf =~ /\S/ ) { $realf{$file} = $realf }
            else {
              vwarn "cannot follow the symlink from $file\n";
            }
          }
        }
        else {
          $contp{$file}{$loc} = 1;
          $contf{$loc}{$file} = 1;
        }
      }
    }
  }
  my %realf_loc; # link ->> pkg
  for my $link ( keys %realf     ) {
    my $realf = $realf{$link};
    if ( $contp{$realf} ) {
      $realf_loc{$link}{$_} = 1 for keys %{ $contp{$realf} };
    }
    else {
      vwarn "can find no package for symlink $link\n";
    }
  }
  for my $link ( keys %realf_loc ) {
    my $file = $realf{$link};
    my @loc  = keys %{ $realf_loc{$link} };
    for ( @loc ) {
      $contp{$file}{$_} = 1;
      $contf{$_}{$file} = 1;
      $contq{$link}{$_} = 1;
      $contl{$_}{$link} = 1;
    }
  }
}

my %contpq; # file/link (cont) ->> pkg
my %contfl; # pkg ->> file/link (cont)
vwarn
  "${bar}in Contents, "
  . "combining the symlinks with the real files...\n";
for my $hash ( \%contp , \%contq  ) {
  for my $file ( keys %$hash ) {
    for my $pkg  ( keys %{ $hash->{$file} } ) {
      $contpq{$file}{$pkg} = $hash->{$file}{$pkg};
    }
  }
}
for my $hash ( \%contf , \%contl  ) {
  for my $pkg  ( keys %$hash ) {
    for my $file ( keys %{ $hash->{$pkg } } ) {
      $contfl{$pkg}{$file} = $hash->{$pkg}{$file};
    }
  }
}

vwarn "${bar}have done reading\n";

# ----------------------------------------------------------------------

# Process.

# Give the manpage files each their several whatis descriptions.
vwarn "${bar}assigning the manpage files each their "
  . "several whatis descriptions...\n";
for my $sect ( keys %manpg ) {
  $manpg{$sect}{$_}{desc} = [] for keys %{ $manpg{$sect} };
}
for my $desc ( keys %whatis ) {
  my $whatis = $whatis{$desc};
  for my $whatis1 ( @$whatis ) {
    my $sect = $whatis1->{sect};
    my $cmd  = $whatis1->{cmd };
    $manpg{$sect}{$cmd} or $ignsct{$sect} or vwarn
      "cannot associate with a manpage file the whatis\n"
      . "$whatis1->{fultxt}\n";
    $manpg{$sect}{$cmd} and push @{ $manpg{$sect}{$cmd}{desc} }, $desc;
  }
}
for my $sect ( keys %manpg ) {
  for my $cmd ( keys %{ $manpg{$sect} } ) {
    my @desc = sort @{ $manpg{$sect}{$cmd}{desc} };
    $manpg{$sect}{$cmd}{desc} = \@desc;
  }
}

# Object if any actual manpage file/link is not in the Contents.
$contpq{$_}
  or vwarn "actual manpage file/link $_ is not in the Contents\n"
  for keys %inv_mp;

# Subroutine: given a binary package name, return all the corresponding
# actual manpage files/links and descriptions (except those in ignored
# man sections).
sub files_descs ($) {
  my( $pkg ) = @_;
  my @file0 = grep { $inv_mp{$_} } sort keys %{ $contfl{$pkg} };
  my @file  = grep { !$ignsct{ $_->{sect} } } map {
    my $sect = $inv_mp{$_}{sect};
    my $cmd  = $inv_mp{$_}{cmd };
    $manpg{$sect}{$cmd}{found} = 1;
    {
      path => $_   ,
      sect => $sect,
      cmd  => $cmd ,
      desc => $manpg{$sect}{$cmd}{desc},
    };
  } @file0;
  return \@file;
}

# Subroutine: given a binary package name, return all the corresponding
# whatis fulltexts.
sub fultxts ($) {
  my( $pkg ) = @_;
  my @fultxt;
  for my $fd ( @{ files_descs $pkg } ) {
    for my $desc ( @{ $fd->{desc} } ) {
      for ( @{ $whatis{$desc} } ) {
        push @fultxt, $_->{fultxt};
        $_->{found} = 1;
      }
    }
  }
  return @fultxt;
}

# ----------------------------------------------------------------------

# Write out.

open  T, '<', $file_debram;

my $is_full_debram_txt;

# Subroutines: get and unget debram.txt lines.
my @ungotline;
sub getline   ( ) { @ungotline ? shift(@ungotline) : scalar(<T>) }
sub ungetline (@) { unshift @ungotline, @_                       }
sub geteof    ( ) { !@ungotline && eof(T)                        }

{

  # Determine if the debram.txt seems a complete one or only a body.
  # If a complete one, then discard the topmatter.
  {
    my @line = ( getline(), getline() );
    if (
      $line[0] eq '-' x $Def::width . "\n" &&
      $line[1] =~ /^\d{${Def::ndig}} \S/
    ) {
      $is_full_debram_txt = '' ;
      ungetline @line;
    }
    else {
      $is_full_debram_txt = '1';
      1 until getline() eq $Def::mark_main_body;
      getline();
    }
  }

  my $state = 1;
  # $state == 0 means that an ordinary debram package entry is expected
  # next.  Other states indicate that the script is in the midst of
  # reading a ram header.
  my $printsave;
  while ( !geteof() ) {
    local $_ = getline();
    if    ( $state == 0 ) {
      if ( !/\S/ ) {
        print;
        ++$state;
      }
      else {
        # Here is an ordinary debram package entry.
        my( $pkg ) = /^.{$w_maint_pri}(\S+)/
          or die "$0: cannot parse debram.txt package entry\n$_\n";
        next unless $dpkggs{$pkg};
        print;
        {
          my %already;
          print map { s/^/${indent}/; $_; } grep {
            my $ret = $already{$_} ? '' : '1';
            $already{$_} = 1;
            $ret;
          } fultxts($pkg);
        }
        print "\n";
      }
    }
    elsif ( $state == 1 ) {
      $printsave = $_;
      ++$state;
    }
    elsif ( $state == 2 ) {
      last unless /^\d{${Def::ndig}} \S/;
      print $printsave, $_;
      ++$state;
    }
    elsif ( $state == 3 ) {
      print;
      $state = 0;
    }
    else { die "$0: impossible" }
  }

}

close T;

# ----------------------------------------------------------------------

# Warn of and/or report on strays.

vwarn "${bar}Finding strays...\n";
{
  if ( $opt{s} ) {
    my $wbar = '-' x $Def::width . "\n";
    print "${wbar}STRAY COMMANDS\n${wbar}";
  }
  for my $desc ( keys %whatis ) {
    for ( @{ $whatis{$desc} } ) {
      unless ( $ignsct{ $_->{sect} } || $_->{found} ) {
        vwarn "stray whatis entry\n$_->{fultxt}\n";
        print "${indent}$_->{fultxt}";
      }
    }
  }
  for my $sect ( sort keys %manpg ) {
    for my $cmd ( sort keys %{ $manpg{$sect} } ) {
      my $manpg1 = $manpg{$sect}{$cmd};
      my $sect   = $manpg1->{sect};
      my $cmd    = $manpg1->{cmd };
      my $path   = $manpg1->{path};
      unless ( $ignsct{$sect} || $manpg1->{found} ) {
        vwarn "stray actual manpage file/link $path\n";
        # (The following does not seem to work, so it is commented out.)
        # if ( $opt{s} ) {
        #   my $found = '';
        #   $_->{found} and $found = '1' for @{ $whatis2{$cmd}{$sect} };
        #   print "${indent}$cmd ($sect)\n" unless $found;
        # }
      }
    }
  }
  if ( $opt{s} ) {
    print "\n\n";
  }
}

