package HTML::FormatText;
# ABSTRACT: Format HTML as plaintext
use 5.006_001;
use strict;
use warnings;
# We now use Smart::Comments in place of the old DEBUG framework.
# The 'use Smart::Comments' line below must stay commented out in release
# versions.
##use Smart::Comments;
use base 'HTML::Formatter';
our $VERSION = '2.12'; # VERSION
# ------------------------------------------------------------------------
# Build the list of default settings for this formatter: everything the
# parent class's default_values() yields, followed by this class's own
# margin defaults (lm and rm).
# NOTE(review): the closing ");" of the list and the closing "}" of the sub
# are not visible in this view of the file -- confirm nothing was lost.
sub default_values {
( shift->SUPER::default_values(),
lm => 3, # left margin
rm => 72, # right margin (actually, maximum text width)
# ------------------------------------------------------------------------
# Apply margin options from the option hash to the formatter.
#
# Arguments:
#   $self - the formatter object
#   $hash - hash reference of options; the keys lm / leftmargin and
#           rm / rightmargin are recognised and are deleted from the hash
#           as they are read
#
# Every warning below is emitted only when $^W (the -w switch) is set.
# NOTE(review): the closing braces of the "if" and "for" blocks and of the
# sub itself are not visible here, and the "Bad margins, ignored" message
# implies an early exit that is not visible either -- confirm against the
# complete file.
sub configure {
my ( $self, $hash ) = @_;
# Start from the current margins so that an omitted option keeps its value.
my $lm = $self->{lm};
my $rm = $self->{rm};
# The long spelling is read after the short one, so it wins if both are given.
$lm = delete $hash->{lm} if exists $hash->{lm};
$lm = delete $hash->{leftmargin} if exists $hash->{leftmargin};
$rm = delete $hash->{rm} if exists $hash->{rm};
$rm = delete $hash->{rightmargin} if exists $hash->{rightmargin};
# Usable text width between the two margins.
my $width = $rm - $lm;
if ( $width < 1 ) {
warn "Bad margins, ignored" if $^W;
if ( $width < 20 ) {
warn "Page probably too narrow" if $^W;
# Whatever keys are still in the hash were not consumed above.
for ( keys %$hash ) {
warn "Unknown configure option '$_'" if $^W;
# Store the (possibly updated) margins back on the object.
$self->{lm} = $lm;
$self->{rm} = $rm;
# ------------------------------------------------------------------------
# Initialise the position bookkeeping that out(), goto_lm(), nl() and the
# header methods read and update: curpos, maxpos and hspace all start at 0.
# NOTE(review): the closing "}" of this sub is not visible in this view.
sub begin {
my $self = shift;
$self->{curpos} = 0; # current output position.
$self->{maxpos} = 0; # highest value of $pos (used by header underliner)
$self->{hspace} = 0; # horizontal space pending flag
# ------------------------------------------------------------------------
# NOTE(review): no statements and no closing "}" are visible for end() in
# this view of the file, so its behaviour cannot be described from here --
# confirm against the complete file.
sub end {
# ------------------------------------------------------------------------
# Start a heading.
#
# Arguments:
#   $self  - the formatter object
#   $level - heading level; used only in the vspace() amount below
#
# NOTE(review): the closing "}" of this sub is not visible in this view.
sub header_start {
my ( $self, $level ) = @_;
# Ask for vertical space; the amount grows as $level gets smaller
# (1 at level 6, 3 at level 1).
$self->vspace( 1 + ( 6 - $level ) * 0.4 );
# Restart the width measurement that header_end() uses for the underline.
$self->{maxpos} = 0;
# ------------------------------------------------------------------------
# Finish a heading.
#
# Arguments:
#   $self  - the formatter object
#   $level - heading level; only levels 1 and 2 (anything <= 2 passes the
#            test) produce output here
#
# NOTE(review): the closing braces of the "if" block and of the sub are not
# visible in this view -- confirm nothing was lost.
sub header_end {
my ( $self, $level ) = @_;
if ( $level <= 2 ) {
# Underline character: '=' for level 1, '-' for level 2. $line stays
# undefined for any other value that passes the test above.
my $line;
$line = '=' if $level == 1;
$line = '-' if $level == 2;
# Repeat it across the widest position reached since header_start(),
# measured from the left margin.
$self->out( $line x ( $self->{maxpos} - $self->{lm} ) );
# ------------------------------------------------------------------------
# Hand the bullet text (first argument after $self) to the parent class's
# bullet() with a single space appended.
# NOTE(review): the closing "}" of this sub is not visible in this view.
sub bullet {
my $self = shift;
$self->SUPER::bullet( $_[0] . ' ' );
# ------------------------------------------------------------------------
# Output a horizontal rule: a run of '-' characters, rm - lm of them, sent
# through out().
# NOTE(review): the closing "}" of this sub is not visible in this view.
sub hr_start {
my $self = shift;
$self->out( '-' x ( $self->{rm} - $self->{lm} ) );
# ------------------------------------------------------------------------
# Prepare a chunk of preformatted text (first argument after $self): flush
# pending vertical space, then indent every line of the text by lm spaces.
# NOTE(review): no statement that emits the indented text is visible, and
# the closing braces of both "if" blocks and of the sub are missing from
# this view -- confirm against the complete file.
sub pre_out {
my $self = shift;
# should really handle bold/italic etc.
if ( defined $self->{vspace} ) {
# Newlines are produced only when $self->{out} is true.
# presumably {out} records that something was already emitted -- verify
if ( $self->{out} ) {
# nl() runs once for each value of vspace that is still >= 0 as it
# counts down.
$self->nl() while $self->{vspace}-- >= 0;
$self->{vspace} = undef;
my $indent = ' ' x $self->{lm};
my $pre = shift;
# /m makes ^ match at the start of every line, so each line gets the indent.
$pre =~ s/^/$indent/mg;
# ------------------------------------------------------------------------
# Process one piece of running text (first argument after $self), applying
# any pending vertical and horizontal space, and advance the position
# counters curpos and maxpos.
# NOTE(review): several pieces are not visible in this view: the closing
# braces of every block, the body of the "do a line break" branch, and any
# statement that emits $text itself. The whitespace-only branch also shows
# no exit, so as written control would continue past it. Confirm against
# the complete file.
sub out {
my $self = shift;
my $text = shift;
# \xA0 (Latin-1 non-breaking space) becomes a plain space; \xAD (soft
# hyphen) has no replacement character and is removed by /d.
$text =~ tr/\xA0\xAD/ /d;
# Whitespace-only text is not output; it only sets the pending-space flag.
if ( $text =~ /^\s*$/ ) {
$self->{hspace} = 1;
if ( defined $self->{vspace} ) {
# Newlines are produced only when $self->{out} is true.
if ( $self->{out} ) {
# nl() runs once for each value of vspace that is still >= 0 as it
# counts down.
$self->nl while $self->{vspace}-- >= 0;
$self->{vspace} = undef;
# Pending horizontal space is cleared along with the vertical space.
$self->{hspace} = 0;
if ( $self->{hspace} ) {
# The test compares the position after adding the text with rm.
if ( $self->{curpos} + length($text) > $self->{rm} ) {
# word will not fit on line; do a line break
else {
# word fits on line; use a space
$self->collect(' ');
$self->{hspace} = 0;
# Advance the current position and track the widest position reached.
my $pos = $self->{curpos} += length $text;
$self->{maxpos} = $pos if $self->{maxpos} < $pos;
# ------------------------------------------------------------------------
# Move to the left margin: when the current position is left of lm, set
# curpos to lm and collect() the spaces needed to get there. Nothing
# happens when curpos is already at or beyond lm.
# NOTE(review): the closing braces of the "if" block and of the sub are not
# visible in this view.
sub goto_lm {
my $self = shift;
my $pos = $self->{curpos};
my $lm = $self->{lm};
if ( $pos < $lm ) {
$self->{curpos} = $lm;
$self->collect( " " x ( $lm - $pos ) );
# ------------------------------------------------------------------------
# Start a new line: the current output position goes back to column 0.
# NOTE(review): no statement that emits the newline character is visible,
# and the closing "}" of this sub is missing from this view -- confirm
# against the complete file.
sub nl {
my $self = shift;
$self->{curpos} = 0;
# ------------------------------------------------------------------------
# Shift the left margin by the amount given as the first argument after
# $self (a negative amount moves it left).
# NOTE(review): the closing "}" of this sub is not visible in this view.
sub adjust_lm {
my $self = shift;
$self->{lm} += $_[0];
# ------------------------------------------------------------------------
# Shift the right margin by the amount given as the first argument after
# the object (a negative amount moves it left).
# NOTE(review): the closing "}" of this sub is not visible in this view.
sub adjust_rm {
# shift removes the object from @_, so $_[0] is the adjustment.
shift->{rm} += $_[0];
=for test_synopsis 1;
=for stopwords latin1 leftmargin lm plaintext rightmargin rm CPAN homepage
=head1 NAME
HTML::FormatText - Format HTML as plaintext
=head1 VERSION
version 2.12

=head1 SYNOPSIS

  use HTML::TreeBuilder;
  $tree = HTML::TreeBuilder->new->parse_file("test.html");

  use HTML::FormatText;
  $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50);
  print $formatter->format($tree);

or, more simply:

  use HTML::FormatText;
  my $string = HTML::FormatText->format_file(
      'test.html',
      leftmargin => 0, rightmargin => 50
  );

HTML::FormatText is a formatter that outputs plain latin1 text. All character
attributes (bold/italic/underline) are ignored. Formatting of HTML tables and
forms is not implemented.
HTML::FormatText is built on L<HTML::Formatter> and documentation for that
module applies to this - especially L<HTML::Formatter/new>,
L<HTML::Formatter/format_file> and L<HTML::Formatter/format_string>.
You might specify the following parameters when constructing the formatter:
=over 4
=item I<leftmargin> (alias I<lm>)
The column of the left margin. The default is 3.
=item I<rightmargin> (alias I<rm>)
The column of the right margin. The default is 72.

=back

=head1 SEE ALSO
See perlmodinstall for information and options on installing Perl modules.
You can make new bug reports, and view existing ones, through the
web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>.
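The project homepage is L<https://metacpan.org/release/HTML-Format>.

The latest version of this module is available from the Comprehensive Perl
Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
site near you, or see L<https://metacpan.org/module/HTML::Format/>.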
=head1 AUTHORS
=over 4
=item *
Nigel Metheringham <>
=item *
Sean M Burke <>
=item *
Gisle Aas <>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.