Friday, November 16, 2012

Generic perl script for parsing n number of different HTML pages at one go

Problem- Create a generic script for parsing the tabled HTML pages. (A single script should parse N number of different HTML pages which resides in different directory)

 

Solution- There will be N number of html files in one directory and corresponding configuration files in another directory (may be the same). The configuration file will define the rules on which basis generic_extract.pl will parse the html files.

To run generic_extract.pl syntax is below.
perl generic_extract.pl html_file_dir conf_file_dir

The arcane detail of HTML specific config file WHICH WILL TELL HOW TO PARSE html is as below.
##################################################################################
##This is html extract configuration file with following specifications.
## if exists at the beginning of the file
## ***  -  it will signify the beginning of configuration entry for a table
## $$$  -  it will signify the end of the configuration entry for a table
## Within *** and $$$ we  will specify below information which will denote the configuration entry for the
## table. The sequence of these entries will be below.
## line 1- table depth|table number
## line 2- Table name
## line 3- number of table rows to be skipped at top during data extraction
## line 4- index of the columns to be extracted (LIST)
## line 5- additional name for each column index (LIST)
## line 6- Column Index making the record name (LIST)
## line 7- string to be substituted in value; format- word |substitute #word|substitute (LIST)
## line 8- string to be substituted in argument name;format- word |substitute#word|substitute (LIST)
## Word and substitute can be regular - expression.
##################################################################################

#!/usr/bin/perl
use HTML::TableExtract;                    
use File::Basename;
use HTML::Parser;
unless((scalar @ARGV == 2))
    { die "uses - perl generic_extract.pl html_dir conf_dir\n";}
my $html_files_dir=shift;
my $conf_file_dir=shift;
########READ ALL CONFIG FILE AND FOR EACH ROW DO PARSEING##############
my $table_depth,$table_num,$table_name;
my @extract_col_index,@extract_col_name,@col_mking_name,@value_substitutions,$timestamp;
          while (my $html_file = glob "$html_files_dir/*")   ##processing each file of html directory
            {
                ##get name of file without any extension
        $html_filename=(split(/\./,((split(/\//,$html_file))[-1])))[0];    
                ##creating absolute name of conf file
        $conf_file=$conf_file_dir."/".$html_filename.".conf";              
                ##Read html file
        open (FILE,"<$html_file") or die "Can't open file to read- $!"; 
                        #read file into a string
                        $html_string='';                
             ## Read entire HTML file into an string for parsing the HTML;
            ## also for improving performance
                        while(<FILE>)
                                {
                                $_=~s/\=3D/\=/g;
                                $_=~s/\r\n//;
                                if(/\=$/)
                                        {
                                        chomp($_);
                                        s/\=$//;
                                        }
                                $html_string.=$_;
                                }
 
                ##now read each html table extract conf file extract table data on the basis of setting
                        open (CONF_FILE,"<$conf_file") or die "Error in opening $conf_file_dir -- $!";
                        my @conf_file_array=<CONF_FILE>;
                        for(my $i =0;$i<=$#conf_file_array;)
                           {
                                if($conf_file_array[$i]=~/^\*\*\*/)
                                {
                                ##Check whether HTML specific config file is a valid file of not
                                unless($conf_file_array[$i+9]=~/^\$\$\$/)
                                        {
                                        warn "CONFIG-FILE ERROR---$conf_file is not proper. Please check for the same\n";
                                        last;
                                        }
                                chomp(($table_depth,$table_num)=split(/\|/,$conf_file_array[$i+1]));
                                   ##Take different parameters into an array
                   chomp($table_name=$conf_file_array[$i+2]);
                                chomp($skip_rows=$conf_file_array[$i+3]);
                                @extract_col_index=split(/\|/,$conf_file_array[$i+4]);
                                @extract_col_name=split(/\|/,$conf_file_array[$i+5]);
                                @col_mking_name=split(/\|/,$conf_file_array[$i+6]);
                                @value_substitutions=split(/#/,$conf_file_array[$i+7]);
                                @argument_substitutions=split(/#/,$conf_file_array[$i+8]);
                                $i+=7;
                            ###Now process the HTML file for each of above entry
                                ##get the table of the HTML to be parsed  
                $te = HTML::TableExtract->new(depth => $table_depth, count => $table_num);
                                $te->parse($html_string);      ##parse the HTML and get the table specified
                                ##Parameter name will be formed with tablename+Some_cell_names
                ##+Column heading; we are getting tablename from conf file
                                my ($midname,$colheading);  
                                foreach $ts ($te->tables)
                                 {
                                  my $row_count=0;
                                   foreach $row ($ts->rows)
                                     {
                                        $midname='';
                                    ##skip the rows as mentioned in conf file
                                        if($row_count<$skip_rows)
                                            {
                                            $row_count++;
                                            next;
                                            }
                                    ## get the middle name of the parameter       
                           foreach my $j (@col_mking_name)
                                               {
                                                if($j=~/\d+/)
                                                        {
                                                        $midname.=$$row[$j];
                                                        }
                                               }
                               ##For each column index there will be common tablename and middle name
                               ##Now we are processing for each column index from where we need to extract value
                               ##and generating the last name too. filename variable will contain the entire
                               ## parameter name and value will contain the value of that parameter
                                           foreach $j (0..$#extract_col_index)
                                               {
                                                #print "\n\n".$midname."\n\n";
                                                unless ($midname=~/^\s*$/ || $midname=~/^\./)
                                                        {$midname=~s/^/\./;}
                                                $lastname=$extract_col_name[$j];
                                                $lastname=~s/^/\./ unless $lastname=~/^\s*$/;
                                                if($table_name=~/^\s*$/)
                                                        {
                                                        $filename=$midname.$lastname;
                                                        $filename=~s/^.//;
                                                        }
                                                else
                                                        {
                                                        $filename=$table_name.$midname.$lastname;
                                                        }
                                                $filename=~s/^\s+|\s+$//;
                                                $filename=~s/\s+//g;
                                                $value=$$row[$extract_col_index[$j]];
                               ##After getting filename and value we will do processing for possible substitution
                               ## defined in conf file  
                               ##We are doing value substitution as instructed in conf file
                        foreach $str (@value_substitutions)
                                                        {
                                                        my ($original,$replace)=split(/\|/,$str);
                                                        $value=~s/$original/$replace/g;
                                                        }
                               ##We are doing filename substitution as defined in conf file   
                        foreach $str (@argument_substitutions)
                                                        {
                                                        my ($original,$replace)=split(/\|/,$str);
                                                        $filename=~s/$original/$replace/g;
                                                        }
                                                print $filename."\t".$value."\n";
                                                }
                                                $lastname='';
                                                $midname='';
                                        }
                                 }
                               }
                               else
                                {$i++;}
                        }
                }


Suppose there is an HTML file named "test1.html" as below in html_file directory.
<html>
<body>
<table>
<tr>
<th>name</th><th>salary</th><th>age</th>
</tr>
<tr>
<td>abc</td><td>INR 10000</td><td>25</td>
</tr>
<tr>
<td>def</td><td>USD 20000</td><td>50</td>
</tr>
</table>
</body>
</html>

and in config_directory corresponding configuration file named "test1.conf" as below.
***
0|0
Details
1
1|2
Salary|Age
0
INR|$#USD|$#

$$$

then the sample run of above script will generate below output.

Details.abc.Salary    $ 10000
Details.abc.Age    25
Details.def.Salary    $ 20000
Details.def.Age    50

Output totally depends on config file. For more details of functions please visit http://www.cpan.org/.

Friday, November 9, 2012

Perl pop3 mail client script

Problem- Fetch the mails from MAIL_BOX (like gmail, yahoo etc) get them into your local machine into special year-month-date file-system hierarchy and extract the attachment or body of the mail as specified for each mail.

Solution-
Solution Artifacts-      get_mail.pl
                                    action.cfg

Get_mail.pl script is used to get mails from remote mail_box to your local system and then we can extract the attachments from the mail or can extract some information from body of the mail into a directory specified in action.cfg. The format of action.cfg is as below.

to_cut|Subject_Match|Transfer_data_to|to_file|cut_from|cut_till|encoding

to-cut- It specifies that what type of information we want to cut from the mail. Possible option are - body/attachment
Subject_Match- It specifies the string to be matched in the subject line of the mail. It should be unique. If not the action corresponding to first match will be evaluated. 
Transfer_Attachement_to- directory where the attachment/body or other extracted information need to be placed
To_file-  Only applicable when content type is body. The name of the file to which extracted contents from the body need to be placed cut_from & cut-till-  Only applicable when content type is body. These will signify the starting and end of the content to be extracted from the body
encoding- if explicit encoding mechanism has been applied before attaching the document; then mention it here.


The script will automatically create the directory structure of below type.

Year-->Month-->Date


Get_mail.pl

#!/usr/bin/perl
use warnings;
use Mail::POP3Client;
use MIME::Parser;
use File::Basename;
use File::Copy;
use MIME::Decoder;
my $extract_file=extract_file_temp;
my ($extract_dir,$action,$match,$cutfrom,$cutto)=('')x5;
my $script_path=dirname($0);
if( "$script_path" eq "\." )
{chomp($script_path=`pwd`);}
#################PARAMETER SETTING SECTION############################

my $basedir="$script_path/mailbox";
my $user='Email_ID';
my $password='PASSWORD';
my $host='WEB_MAIL_SERVER';
my $port='MAIL_SERVER_POP3_REQUEST_PORT';
my $config_file='action.cfg';
open LOGREPORT,">>","$script_path/mail_export.log";
my $record='';
#####READ CONFIG FILE INTO ARRAY######################

open ACTION_R,"<",join('/',($script_path,$config_file)) or die "unable to open";
my @action_file_array=<ACTION_R>;
close ACTION_R;
#####DETERMINE MAIL DIRECTORY#######################

if (! -e $basedir )
{mkdir $basedir;}
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
chdir $basedir;
$year+=1900;
$mon+=1;
if($mon < 10)
{$mon="0"."$mon";}
if ($mday < 10)
{$mday="0"."$mday";}

if(! (-e $year && -d $year))
        {mkdir $year or die "unable to create directory:: $year.  error-$!";} chdir $year; if (! (-e $mon && -d $mon))
        {mkdir $mon or die "unable to create directory:: $mon.  error-$!";} chdir $mon; if(! (-e $mday && -d $mday))
        {mkdir $mday or die "unable to create directory:: $mday.  error-$!";} chdir $mday; chomp($maildir=`pwd`); my $parser = new MIME::Parser; $parser->ignore_errors(1); $parser->extract_uuencode(1); $parser->tmp_recycling(0); $parser->output_to_core(1);

$pop = new Mail::POP3Client( USER     => $user,
                             PASSWORD => $password,
                             HOST     => $host,
                             PORT => $port
                            ) || die "unable to connect to Microsoft Mail Exchange Server"; for( $msg_num = 1; $msg_num <= $pop->Count(); $msg_num++ ) {
        $record='';
        my ($sec,$min,$hour,$mday,$mon,$year) =localtime(time);
        $unq_key=(1900+$year).($mon+1).$mday.$hour.$min.$sec;
        open $MAIL_W,">",join('/',($maildir,$msg_num));
        $pop->HeadAndBodyToFile($MAIL_W,$msg_num);
        $MAIL_W->close;
        #$pop->Delete($msg_num);
        open MAIL_R,"<",join('/',($maildir,$msg_num));
        my $entity = $parser->parse(\*MAIL_R);
        my $from=$entity->head->get('From');
        chomp($f=((split('[<>]',"$from"))[1]));
        my $subject=$entity->head->get('Subject');
        my $date=$entity->head->get('Date');
        chomp($date=((split('[,+]',$date))[1]));
        $date=~s/^\s+|\s+$//g;
        $date=~s/\s+//g;
######ACTION HANDELING##############################
        chomp($subject=join ' ',(split / +/,$subject));
        foreach (@action_file_array)
        {
        chomp($match=(split(/\|/))[1]);
        if($subject=~m/$match/i)
        {
        chomp($extract_dir=(split(/\|/))[2]);
        chomp($action=(split(/\|/))[0]);
        chomp($bodyfilename=(split(/\|/))[3]);
        chomp($cutfrom=(split(/\|/))[4]);
        chomp($cutto=(split(/\|/))[5]);
        chomp($explecit_encoding=(split(/\|/))[6]);
        last;
        }
        }
##################################################
        $record.=$subject." -- ";
        my @parts=$entity->parts;
        if((scalar @parts == 0) && $action=~/body/)      
#single parted mail;body to be extracted
                {
                open "BODY_R","<","$maildir/$msg_num";
                open "BODY_W",">","$extract_dir/$bodyfilename${date}_${unq_key}";
                        while(<BODY_R>)
                        {
                        if(/$cutfrom/../$cutto/)
                                {
                                print BODY_W $_;
                                }
                        }
                $record.=$bodyfilename.' , ';
                }
        else
        {
        while(my $part = shift(@parts))
                {
                  if($part->parts)
                  {
                        push @parts,$part->parts; # Nested multi-part
                        next;
                  }
                  my $type=$part->head->mime_type || $part->head->effective_type;
                  my $encoding=$part->head->mime_encoding;
                  my $filename=$part->head->recommended_filename;

  #####extract the parts of the mail######################

                        my $io=$part->open("r");
                        open F,">",join('/',($maildir,$extract_file));
                        my $buf='';
                        while($io->read($buf,1024))
                                {
                                print F $buf;
                                }
                        close(F);
                        if($action=~/attachment/ && $filename )
            ## get the attachment
                              {
                                if($explecit_encoding)
                                        {
                                        $decoder = new MIME::Decoder $explecit_encoding;
                                        open DECODER_R,"<",join('/',($maildir,$extract_file));
                                        open DECODER_W,">",join('/',($extract_dir,"${unq_key}_${filename}"));
                                        $decoder->decode(\*DECODER_R, \*DECODER_W);
                                        close(DECODER_R);
                                        close(DECODER_W);
                                        }
                                else
                                        {
                                        copy("$maildir/$extract_file","$extract_dir/${unq_key}_${filename}");
                                        }
                                $record.=$filename;
                                }
                        if($action=~/body/)                       ##get the body
                        {
                                open "BODY_R","<","$maildir/$extract_file";
                                $temp_str='';
                                while(<BODY_R>)
                                        {
                                        if(/$cutfrom/../$cutto/)
                                                {
                                                $temp_str.=$_;
                                                }
                                        unless($temp_str=~/^$/)
                                                {
                                                open "BODY_W",">","$extract_dir/$bodyfilename${date}_${unq_key}";
                                                print BODY_W $temp_str;
                                                }
                                        }
                                $record.=$bodyfilename.${date}."_".${unq_key}."_";
                        }
                        $io->close;
                }
        }
        rename("$maildir/$msg_num","$maildir/$date-${f}_$unq_key");
        $record=$date." -- ".$record."\n";
        print LOGREPORT $record;
        $action='';
        sleep 1;
}
$pop->Close();