#!/usr/bin/env perl # # A simple Japanese grammer checker using cabocha. # Copyright (c) 2005-2012, Hiroyuki Ohsaki. # All rights reserved. # # $Id: jcorrect,v 1.17 2023/03/30 08:40:31 ohsaki Exp $ # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. use Getopt::Std; use IPC::Open2; use strict; my $MAX_PHRASE_LEN = 60; my $MAX_SENTENCE_LEN = 180; my $DELIM_LEFT = '<<<<'; my $DELIM_RIGHT = '>>>>'; our $line; sub error { my $str = shift; print "$line: **** $str\n"; } sub warning { my $str = shift; print "$line: $str\n"; } sub check_length { my $str = shift; # check sentence length my $len = length($str); error "too long sentence (should be <= $MAX_SENTENCE_LEN chars)" if ($len > $MAX_SENTENCE_LEN); # check phrase length for (split('、', $str)) { my $len = length($_); error "too long phrase (should be <= $MAX_PHRASE_LEN chars)" if ($len > $MAX_PHRASE_LEN); } } sub find_kaku { my ($regexp, $listp) = @_; # find the first occurence of REGEXP in LISTP for (0 .. $#{$listp}) { return $_ if (${$listp}[$_] =~ /$regexp/); } return undef; } sub check_kakari_subj { my $hashp = shift; my @id = sort {$a <=> $b} keys %{$hashp}; my $last_id = $id[-1]; my @list; for my $from (@id) { next if ($hashp->{$from}->{type} eq 'A'); next if ($hashp->{$from}->{type} eq 'P'); push(@list, $hashp->{$from}->{phrase}); } error "misssing subject for `$hashp->{$last_id}->{phrase}'" unless (defined find_kaku('(は|が)$', \@list)); } my %RULES = ( 'している' => 'する', 'について' => 'を', 'など' => '(削除する)', 'を行う' => 'する', 'なる' => '(具体的に)', 'に関して' => '(具体的に)', 'とき' => '時', ); # される/された → 能動態で # ことで/用いる/利用する → あいまい sub check_kakari_verbose { my $hashp = shift; # check inappropriate word my @id = sort {$a <=> $b} keys %{$hashp}; for (@id) { for my $clause (keys %RULES) { error "avoid using `$clause' (instead use `$RULES{$clause}')" if ($hashp->{$_}->{phrase} =~ /$clause/); } } } sub check_kakari_dep { my $hashp = shift; my @id = sort {$a <=> $b} keys %{$hashp}; for my $to (reverse @id) { my @list = (); for my $from (@id) { if ( $hashp->{$from}->{to} == $to and $hashp->{$from}->{type} =~ /^[DO]$/) { push(@list, $hashp->{$from}->{phrase}); } } next unless (@list >= 1); warning( sprintf "check meaning of `%s -> %s'", join('|', @list), $hashp->{$to}->{phrase} ); # check word order my @pos = ( find_kaku('(は|が)$', \@list), find_kaku('を$', \@list), find_kaku('から$', \@list), find_kaku('に$', \@list) ); for my $i (0 .. $#pos) { for my $j ($i + 1 .. $#pos) { next unless (defined $pos[$i] and defined $pos[$j] and $pos[$i] > $pos[$j]); error "reversed word order `$list[$pos[$j]] -> $list[$pos[$i]]'"; } } } } sub check_kakari_parallel { my $hashp = shift; my @id = sort {$a <=> $b} keys %{$hashp}; my %visited; for my $from (@id) { my @list = (); $_ = $from; while ($hashp->{$_}->{type} eq 'P' and !$visited{$_}) { push(@list, $hashp->{$_}->{phrase}); $visited{$_} = 1; $_ = $hashp->{$_}->{to}; } if (@list) { push(@list, $hashp->{$_}->{phrase}); $visited{$_} = 1; } next unless (@list >= 1); warning(sprintf "check meaning of `%s'", join(' = ', @list)); # check and/or style my $is_ng = 0; for (0 .. $#list) { if ($_ != $#list - 1) { $is_ng = 1 if ($list[$_] =~ /(および|または|もしくは)$/); } else { $is_ng = 1 if ($list[$_] !~ /(および|または|もしくは)$/); } } error( sprintf "incorrect enumeration style (%s) (%s)", join('、', @list), "e.g., `A、B、CおよびD'" ) if $is_ng; } } sub check_kakari { my $str = shift; # open and write sentence to cabocha my $pid = open2(*IN, *OUT, 'cabocha -f1'); print OUT $str; close(OUT); # load cabocha's output into hash my $id; my %hash; while () { chomp; next if /^EOS/; if (/^\*\s+\d/) { my ($dummy, $from_id, $to_id, $pos, $val) = split(/\s+/, $_); $id = $from_id; $to_id =~ s/([APDO])//; $hash{$id}->{to} = $to_id; $hash{$id}->{type} = $1; $hash{$id}->{phrase} = ''; } else { next if /未知語/; my ($word, $yomi, $orig, $type, $opts) = split(/\s+/, $_); next if ($type eq '記号-句点'); next if ($type eq '記号-読点'); $hash{$id}->{phrase} .= $word; warning "check referred word/phrase by `$word'" if ($type eq '連体詞'); } } close(IN); check_kakari_subj(\%hash); check_kakari_verbose(\%hash); check_kakari_dep(\%hash); check_kakari_parallel(\%hash); } sub dump_kakari { my $str = shift; my $pid = open2(*IN, *OUT, 'cabocha'); print OUT $str; close(OUT); $str =~ s/、/、\n/g; print "\n$str\n\n"; while () { chomp; next if /^EOS/; print "$_\n"; } close(IN); print "\n"; } my $buf = ''; while (<>) { chomp; # remove item head s/^\s*-\s*//g; # remove citations s/\[\d{5}\]//g; # remove spaces s/^\s+//; s/\s+$//; my $delim = ($ARGV eq '-') ? "$DELIM_LEFT$.$DELIM_RIGHT" : "$DELIM_LEFT$ARGV:$.$DELIM_RIGHT"; if (/^\s*$/) { $buf .= $delim; } else { $buf = $delim unless $buf; s/。/"。$delim"/eg; $buf .= $_; } } for (split(/$DELIM_LEFT/, $buf)) { my $str; ($line, $str) = split($DELIM_RIGHT, $_, 2); next unless $str; # remove annotation $str =~ s/\(.*?\)//g; dump_kakari($str); check_length($str); check_kakari($str); }