#!/usr/bin/env perl # # Remove all TeX and LaTeX commands from a plain text. # Copyright (C) 1996-2012 Hiroyuki Ohsaki. # All rights reserved. # # $Id: tex2txt,v 1.23 2023/03/30 08:43:34 ohsaki Exp $ # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # usage: tex2txt [-lx] txt-file use File::Basename; use Getopt::Std; use strict; sub usage { my $prog = basename($0); die <{$label++} = $1; } my @buf = split(/\n/, $buf); my ($eqnarray, $delim, $figure, $table, $verbatim); for (@buf) { # section s/\\(sub)?(sub)?section\*?{([^\}]+)}/* $3/; # label s/\\label\{[^\}]+\}//g; # reference while (/\\ref\{[^\}]+\}/) { s/(\\ref\{[^\}]+\})/$label/; $hashp->{$label++} = $1; } # citation while (/\\cite\{[^\}]+\}/) { s/(\\cite\{[^\}]+\})/[$cite]/; $hashp->{$cite++} = $1; } # enumerate, itemize s/\\(begin|end){(enumerate|itemize)}//; # item s/\\item/- /; # verb s/\\verb\|([^\|]+)\|/SYMBOL/g; # white space s/[~ ]/ /g; # newline s/\\[\\ ]//g; # quotation s/\'\'/\"/g; s/\`\`/\"/g; # hypehn s/-{2,}/-/g; # eqnarray if (/\\begin\{eqnarray\*?\}/) { $eqnarray = 1; $delim = ''; } if ($eqnarray) { $delim = $1 if /([.,;])\s*$/; if (/\\end\{eqnarray\*?\}/) { $eqnarray = 0; $_ = "EQUATION$delim\n"; } else { $_ = ''; } } # figure if (/\\begin\{figure\}/) { $figure = 1; } if ($figure) { $figure = 0 if (/\\end\{figure\}/); if (/\\caption\{([^\}]+)\}/) { $_ = $1; } else { $_ = ''; } 
}   # closes the "if ($figure)" block opened on the preceding source line

        # table: same treatment as figure.  While inside a table
        # environment every line is replaced by its caption text if it
        # carries a \caption{...}, and emptied otherwise.  The flag is
        # cleared on the line containing \end{table}, which is itself
        # still processed as part of the table.
        if (/\\begin\{table\}/) {
            $table = 1;
        }
        if ($table) {
            $table = 0 if (/\\end\{table\}/);
            if (/\\caption\{([^\}]+)\}/) {
                $_ = $1;
            } else {
                $_ = '';
            }
        }

        # verbatim: every line from \begin{verbatim} through
        # \end{verbatim} (inclusive) is emptied.
        if (/\\begin\{verbatim\}/) {
            $verbatim = 1;
        }
        if ($verbatim) {
            $verbatim = 0 if (/\\end\{verbatim\}/);
            $_ = '';
        }

        # other commands: drop "\name", an optional "*", an optional
        # [...] argument, an optional {...} group and trailing whitespace.
        # NOTE(review): "{.*}" is greedy, so it extends from the first "{"
        # after the command to the LAST "}" on the line -- confirm that
        # this is intended.
        s/\\\w+\*?(\[[^\]]+\])?({.*})?\s*//g;
        # Unwrap any remaining {...} group, keeping its contents.
        s/{([^\}]+)}/$1/g;

        # delete space before/after Japanese characters (matched here as
        # bytes with the high bit set).  Whitespace that follows "*" or
        # "-" is kept so the "* heading" and "- item" markers generated
        # earlier in the loop stay intact.
        s/([^*-])\s*([\x80-\xff])/$1$2/g;
        s/([\x80-\xff])\s*/$1/g;

        # delete space around slashes
        s:\s*(/)\s*:$1:g;

        # Every input line yields exactly one output line; lines emptied
        # above are printed as blank lines.
        print "$_\n";
    }
}

# Print the placeholder table collected by tex2txt.
#
# $hashp - reference to a hash whose keys are the placeholders that were
#          substituted into the text and whose values are the original
#          LaTeX commands (e.g. \ref{...}, \cite{...}) they replaced.
#
# Output (on STDOUT): a blank line, "__END__", a blank line, and then one
# "placeholder<TAB>command" line per entry, with any newlines removed
# from the command.
#
# Purely numeric placeholders are listed first in numeric order (so that
# 2 precedes 10); all other placeholders follow in string order.  The
# hash passed in is not modified.
sub dump_labels {
    my $hashp = shift;

    print "\n__END__\n\n";

    my @keys = sort {
        my $a_is_num = $a =~ /\A\d+\z/;
        my $b_is_num = $b =~ /\A\d+\z/;
          ($a_is_num && $b_is_num) ? ($a <=> $b || $a cmp $b)
        : $a_is_num                ? -1
        : $b_is_num                ? 1
        :                            $a cmp $b;
    } keys %{$hashp};

    for my $key (@keys) {
        # Work on a copy so the caller's data is left untouched.
        (my $command = $hashp->{$key}) =~ s/\n//g;
        print "$key\t$command\n";
    }
}

# Command-line driver.
#   -x  run extract_labels on the input instead of converting it
#   -l  after converting, also print the placeholder table
our ($opt_l, $opt_x);
getopts('lx') or usage();

# Read all input (files named on the command line, or STDIN) up front.
my @buf = <>;
if ($opt_x) {
    extract_labels(\@buf);
} else {
    my %hash;
    tex2txt(\@buf, \%hash);
    dump_labels(\%hash) if $opt_l;
}