package TKL::OAI;

## $Id: OAI.pm,v 1.4 2004/01/06 14:22:40 sondberg Exp $

use LWP::UserAgent;
use HTTP::Request;
use XML::Parser;
use Carp;
use strict;
use vars qw(@ISA);

@ISA = qw(LWP::UserAgent);

## Hash key under which store_obj/get_obj stash the TKL::OAI object inside
## other objects (the HTTP::Request and the XML parser created in oai_request).
my $my_oai = '_my_oai';
## Set to a true value to make the expat handlers print their state
## transitions (push/reset/flush/neglect) to STDOUT.
my $debug = 0;
## XML declaration that handle_end prepends to every record handed to a
## user callback.
my $xml_header = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>";
## Name of the OAI flow-control element. The same string is used as the
## request argument name and as the hash key holding the token on the object.
my $resumptionToken = "resumptionToken";


sub oai_option {
## Records one or more OAI options on the object.
## Arguments are a flat list: name1 => value1, name2 => value2, ...
## Options already set keep their value unless named again.

    my $self = shift;
    my %options = @_;

    for my $name (keys %options) {
        $self->{oai_options}{$name} = $options{$name};
    }
}


sub store_obj {
## Attaches $obj to the hash-based object $storage under the private key
## held in $my_oai. Dies if that slot already holds a defined value, so an
## existing data member is never silently replaced.

    my $storage = shift;
    my $obj = shift;

    die "$0: Naughty you, overriding existing data member '$my_oai'"
        if defined $storage->{$my_oai};

    $storage->{$my_oai} = $obj;
}


sub get_obj {
## Counterpart of store_obj: returns whatever was attached to the
## hash-based object $storage under the private key held in $my_oai.

    my ($storage) = @_;

    return $storage->{$my_oai};
}


sub set_callback {
## Registers element callbacks, given as element => \&my_callback pairs.
## Each registration is stored as a small record holding the code
## reference and a 'seen' counter initialised to 0.
## At this point, only one element/callback pair can be assigned.

    my $self = shift;
    my %handles = @_;

    for my $element (keys %handles) {
        $self->{handles}->{$element} = {
            callback => $handles{$element},
            seen     => 0,
        };
    }
}


sub get_callback {
## Returns the callback code reference registered for $element, or undef
## if no callback has been registered for that element.
##
## The lookup is done step by step on purpose: a chained
## $self->{handles}->{$element}->{callback} would autovivify an empty
## entry for an unknown element, and such a phantom entry is later
## mistaken for a registered element by handle_start and get_elements.

    my ($self, $element) = @_;
    my $handles = $self->{handles};

    ## Explicit undef (not a bare return) keeps the historical behaviour of
    ## always returning exactly one value, also in list context.
    return undef unless ref($handles) && defined($element);
    return undef unless exists $handles->{$element};

    my $handle = $handles->{$element};
    return ref($handle) ? $handle->{callback} : undef;
}


sub incomplete {
## Tells whether the last response was only a partial list: returns the
## stored resumptionToken in that case, otherwise undef.

    my ($self) = @_;

    return $self->{$resumptionToken};
}


sub cursor {
## Returns the 'cursor' attribute recorded from the last resumptionToken
## element, i.e. the record offset of an incomplete list (undef if none).

    my ($self) = @_;

    return $self->{cursor};
}


sub completeListSize {
## Returns the 'completeListSize' attribute recorded from the last
## resumptionToken element, i.e. the total number of records in the
## complete list (undef if none was seen).

    my ($self) = @_;

    return $self->{completeListSize};
}


sub expirationDate {
## Returns the 'expirationDate' attribute recorded from the last
## resumptionToken element (undef if none was seen).

    my ($self) = @_;

    return $self->{expirationDate};
}


sub get_elements {
## Returns the names of all elements that have a callback registered
## (in list context; the number of such elements in scalar context).

    my ($self) = @_;

    return keys %{ $self->{handles} };
}


sub harvested {
## Returns the number of records handed to a callback so far
## (undef as long as no record has been harvested).

    my ($self) = @_;

    return $self->{harvested};
}


sub reset_xml {
## Empties the buffer in which the XML of the current record is collected.

    my $self = shift;

    $self->{raw_xml} = q{};
}


sub store_xml {
## Appends a chunk of XML to the record buffer. The chunk is passed as a
## reference to a scalar.

    my $self = shift;
    my $xml_ref = shift;

    $self->{raw_xml} .= ${$xml_ref};
}
    

sub get_xml {
## Returns the XML collected so far for the current record, as a string.

    my $self = shift;

    return $self->{raw_xml};
}


sub _uri_escape {
## Percent-encodes one query-string component. Only the RFC 3986
## unreserved characters are left untouched. Character strings (such as
## text coming out of the XML parser) are encoded as UTF-8 first; an
## undefined value is treated as the empty string.

    my ($value) = @_;

    $value = "" unless defined $value;
    utf8::encode($value) if utf8::is_utf8($value);
    $value =~ s/([^A-Za-z0-9\-._~])/sprintf("%%%02X", ord($1))/ge;
    return $value;
}


sub oai_request {
## Send an OAI request to the OAI server specified in option url
## The request is specified as arguments of the form key1 => value1, key2 => value2, etc.
##
## Arguments given in earlier calls are remembered and merged with the new
## ones (new values win), so a follow-up call needs no arguments at all.
## If the previous response left a non-empty resumptionToken behind, the
## request consists of the verb and that token only, because the token is
## an exclusive argument in OAI-PMH.
##
## Options used (see oai_option): url (mandatory), callback (chunk
## callback, defaults to \&callback) and chunksize (defaults to 4096).
##
## Returns whatever LWP::UserAgent::request returns (the HTTP response).
## Croaks if option url is not set or if no verb is known.

    my ($self, @settings) = @_;
    my @old_settings = @{$self->{old_settings} || []};
    my %oai = (@old_settings, @settings);
    my $options = $self->{oai_options} || {};
    my $callback = $options->{callback} || \&callback;
    my $chunk_size = $options->{chunksize} || 4096;
    my $parser = XML::Parser->new(Handlers => { Start => \&handle_start,
                                                End   => \&handle_end,
                                                Char  => \&handle_char });

    if (!defined($options->{url})) {
        croak "oai_request: You must set OAI option url before calling oai_request";
    }
    if (!defined($oai{verb})) {
        croak "oai_request: Verb must be specified in OAI request";
    }

    ## Work out what is actually sent. An empty token (as left behind by the
    ## empty resumptionToken element that ends a list) means "nothing to
    ## resume" and must not be sent.
    my %params = %oai;
    my $token = $self->{$resumptionToken};
    if (defined($token) && length($token)) {
        %params = (verb => $oai{verb}, $resumptionToken => $token);
    }

    ## Forget everything learned from the previous response.
    $self->{$resumptionToken} = undef;
    $self->{expirationDate} = undef;
    $self->{cursor} = undef;
    $self->{completeListSize} = undef;
    $self->{incomplete} = undef;
    ## A previous response that was cut off in the middle of a record would
    ## otherwise leave the handlers in "collecting" state for the new document.
    $self->{triggered} = 0;

    ## Remember the caller's arguments only - never the server-issued token -
    ## so that a later, fresh harvest does not inherit a stale token.
    $self->{old_settings} = [%oai];

    ## Build the query string: verb first, the rest sorted, all escaped.
    my @names = sort { ($b eq 'verb') <=> ($a eq 'verb') || $a cmp $b } keys %params;
    my $query = join "&",
        map { _uri_escape($_) . "=" . _uri_escape($params{$_}) } @names;

    ## Pick the separator so that a base URL which already carries a query
    ## string is extended rather than given a second '?'.
    my $url = $options->{url};
    my $separator = "?";
    if ($url =~ /[?&]\z/) {
        $separator = "";
    } elsif ($url =~ /\?/) {
        $separator = "&";
    }
    $url .= $separator . $query;

    my $request = HTTP::Request->new('GET', $url);
    ## Attach this object to the request and to the parser, so that the chunk
    ## callback and the expat handlers can find their way back to it.
    ## store_obj itself refuses to overwrite an existing data member.
    store_obj($request, $self);
    store_obj($parser, $self);
    $self->{chunk_parser} = $parser->parse_start();
    ## NOTE(review): the non-blocking parser is never finalised with
    ## parse_done(); confirm against the XML::Parser documentation whether
    ## that should happen once the request has completed.
    return $self->request($request, $callback, $chunk_size);
}


sub get_orig_request {
## Given an HTTP response, follows the chain of previous() responses back
## to the very first one and returns the request belonging to it.

    my ($response) = @_;
    my $earliest = $response;

    while (1) {
        my $before = $earliest->previous();
        last unless $before;
        $earliest = $before;
    }
    return $earliest->request();
}


sub callback {
## Default chunk callback handed to LWP: looks up the harvester object
## attached to the original request and feeds the chunk of fetched data
## to its chunk parser.
    my ($data, $response, $protocol) = @_;

    my ($request) = get_orig_request($response);
    my $harvester = get_obj($request);

    $harvester->{chunk_parser}->parse_more($data);
}



sub handle_start {
## Expat element start handler
##
## Three cases:
##  - While a record is being collected (or when $element is one that has
##    a callback registered, which starts a new record), the start tag is
##    re-serialised into the record buffer and the element is pushed onto
##    the open-element stack.
##  - A resumptionToken element outside a record marks the response as
##    incomplete and records its cursor, completeListSize and
##    expirationDate attributes.
##  - Any other element is ignored.
##
## The parser hands over attribute values already decoded, so they are
## escaped again here; otherwise a value containing &, < or " would make
## the collected record malformed. Attributes are written in the order the
## parser delivered them.

    my ($expat, $element, @attr_list) = @_;
    my %attr = @attr_list;
    my $oai = get_obj($expat);

    if (!$oai->{triggered} && $oai->{handles}->{$element}) {
        ## First tag of a new record: start with a clean stack and buffer.
        $oai->{triggered} = 1;
        $oai->{stack} = [];
        if ($debug) {
            print "Resetting at tag '$element'\n";
        }
        $oai->reset_xml();
    }

    if ($oai->{triggered}) {
        if ($debug) {
            print "Pushing '$element'\n";
        }
        push @{$oai->{stack}}, $element;

        my $xml = "<$element";
        for (my $i = 0; $i < $#attr_list; $i += 2) {
            my ($name, $value) = @attr_list[$i, $i + 1];
            $value = "" unless defined $value;
            ## '&' must be handled first, or the entities produced by the
            ## following substitutions would be escaped a second time.
            $value =~ s/&/&amp;/g;
            $value =~ s/</&lt;/g;
            $value =~ s/>/&gt;/g;
            $value =~ s/"/&quot;/g;
            $xml .= " $name=\"$value\"";
        }
        $xml .= ">";
        $oai->store_xml(\$xml);
    } elsif ($element eq $resumptionToken) {
        $oai->{incomplete} = 1;
        $oai->{cursor} = $attr{cursor};
        $oai->{completeListSize} = $attr{completeListSize};
        $oai->{expirationDate} = $attr{expirationDate};
    } else {
        if ($debug) {
            print "Neglecting tag '$element'\n";
        }
    }
}


sub handle_end {
## Expat element end handler
##
## Does nothing unless a record is being collected. Otherwise the end tag
## is appended to the record buffer and the element is popped off the
## open-element stack. When the stack becomes empty the record is
## complete: the callback registered for $element (if any) is called with
## the harvester object and the record, prefixed with an XML declaration,
## the harvested counter is incremented, and collecting stops.
##
## A begin-/end-tag mismatch is reported as a warning on STDERR, so that
## the diagnostic cannot end up in the middle of whatever the calling
## program writes to STDOUT.

    my ($expat, $element) = @_;
    my $oai = get_obj($expat);

    return unless $oai->{triggered};

    my $stack = $oai->{stack};
    my $opened = pop @$stack;
    unless (defined($opened) && $opened eq $element) {
        warn "Mismatching begin-/end-tag in line " . $expat->current_line() . "\n";
    }

    my $xml = "</$element>";
    $oai->store_xml(\$xml);

    unless (@$stack) {
        if (my $callback = $oai->get_callback($element)) {
            if ($debug) {
                print "Flushing\n";
            }
            $callback->($oai, "$xml_header\n" . $oai->get_xml());
            $oai->{harvested}++;
        }
        $oai->{triggered} = 0;
    }
}


sub handle_char {
## Expat character handler
##
## Leading and trailing whitespace is stripped from every chunk of
## character data. Then:
##  - while a record is being collected, the text is XML-escaped and
##    appended to the record buffer;
##  - otherwise, if a resumptionToken element has been seen and no token
##    has been stored yet, the text is stored as the token.
##
## The parser delivers character data with entities already decoded, so
## every &, < and > has to be escaped again - all occurrences, and '&'
## first so that the entities written here are not escaped twice.
## The token is stored unescaped: it is sent back to the server as a
## request argument, not embedded in XML.
##
## NOTE(review): the parser may deliver one text node in several chunks;
## trimming each chunk then also removes whitespace inside the text.
## Kept as is because callers may rely on the trimmed output.

    my ($expat, $str) = @_;
    my $oai = get_obj($expat);

    $str =~ s/^\s+//;
    $str =~ s/\s+$//;
    if ($oai->{triggered}) {
        $str =~ s/&/&amp;/g;
        $str =~ s/</&lt;/g;
        $str =~ s/>/&gt;/g;
        $oai->store_xml(\$str);
    } elsif ($oai->{incomplete} && !defined($oai->{$resumptionToken})) {
        $oai->{$resumptionToken} = $str;
    }
}


1;

__END__

=head1 NAME

TKL::OAI - Perl package with methods that help you implement a simple OAI harvester.

=head1 SYNOPSIS

  use TKL::OAI;

  my $oai = new TKL::OAI;
  $oai->oai_option(url => "http://some.oai.repository.org/");
  $oai->set_callback(record => \&handle_record);
  
  my $r = $oai->oai_request(verb => 'ListRecords', metadataPrefix => 'oai_dc');

  while ($r->is_success) {
      last unless $oai->incomplete;
      $r = $oai->oai_request;
  }

  sub handle_record {
      my ($self, $raw_xml) = @_;

      ## Handle single OAI record
  }
  				

=head1 DESCRIPTION

This Perl package is a toolbox for constructing an OAI harvester
in pure Perl. The OAI protocol is designed to allow 2 or more
archives to exchange data via HTTP/XML. The server side - or, to
use OAI jargon, the OAI repository side - is in TKL context
implemented in PHP and is integrated with the web-server. Read
more about this at

  http://www.indexdata.dk/tkl/doc/oai.tkl#oairep

Constructing an OAI harvester is a matter of setting up
an HTTP robot client that sends OAI requests - or, in OAI jargon,
"verbs" - to the OAI repository. OAI requests are encoded as
URIs, for instance as

  http://my.oai.repository.org/?verb=Identify
  http://my.oai.repository.org/?verb=ListRecords&metadataPrefix=oai_dc

=head2 Methods

These are the methods contained in this class:

=head3 oai_option

Set various OAI options, for instance the OAI repository URL. The
syntax is this:

  $oai->oai_option(key1 => 'value1', key2 => 'value2');

=head3 set_callback

Register an XML element callback function. Use the following syntax:

  $oai->set_callback(element_name => \&callback_routine);

This associates the function callback_routine with the XML element
'element_name'. Having registered this association, the callback
routine will be called every time an element with this name is seen.
For the moment, only one such (element name, callback routine) pair
can be registered. The callback function has to have the following prototype:

  sub callback_routine {
      my ($self, $raw_xml) = @_;
      
      ## Handle the record
  }

The $self parameter is a blessed reference of the type TKL::OAI, and
$raw_xml is a buffer with the XML code inside the 'element_name' tag.

It's your own job to do something meaningful with the XML!

=head3 get_callback

Method to get the referenced callback routine as a function of the
element name. The syntax is this:

  my $code_reference = $oai->get_callback("my_element_name");

where the routine pointed to by $code_reference can be invoked for
instance by

  &$code_reference($oai, '<?xml version="1.0"?>.......');

=head3 incomplete

When OAI data sets are rather large, an OAI repository can choose
(but does not have to) to return an incomplete subset accompanied by
a so-called resumptionToken pointing to the next subset. This
feature is referred to as flow control and is nicely described at

  http://www.openarchives.org/OAI/openarchivesprotocol.html#FlowControl

The TKL::OAI package for OAI harvesting supports flow-control. When
a HTTP response has been received, you should check whether the
returned data set is complete - or if more is yet to come. Use the
following syntax:

  my $resumptionToken = $oai->incomplete;

If the returned data set was incomplete, the resumptionToken is
returned, otherwise undef is returned, in which case the OAI harvesting
task is finished.

In theory, you don't need to make book-keeping with resumptionTokens.
Just remember to check if the returned data set is complete and call
the oai_request method successively (see below) if it was incomplete.

=head3 cursor

In case the returned data set is incomplete, this method returns the
current record offset:

  my $offset = $oai->cursor;

=head3 completeListSize

If the returned data set is incomplete, this method returns the total
number of records in the complete data set:

  my $total_number = $oai->completeListSize;

=head3 expirationDate

For incomplete data sets, the expiration date for the resumptionToken
can be queried this way:

  my $expire = $oai->expirationDate;

=head3 get_elements

Returns a list of elements associated with a callback:

  my @elements = $oai->get_elements;

=head3 harvested

Get the number of successfully harvested OAI records:

  my $number = $oai->harvested;

=head3 reset_xml

Internal function used by the state-machine to reset collected XML
code. Please don't call this method from outside the TKL::OAI package:

  $oai->reset_xml;

=head3 store_xml

Internal method to store a chunk of XML data (by reference). Don't
call this method unless you know exactly what you're doing:

  $raw_xml = "<test><hello/></test>";
  $oai->store_xml( \$raw_xml );

=head3 get_xml

Internal method to return the collected XML data as a string. The
syntax is:

  my $raw_xml = $oai->get_xml;

=head3 oai_request

Send an OAI request to the OAI repository (server) specified by the
oai_option 'url'. The OAI parameters should be specified as comma
separated key => value pairs, for instance

  my $http = $oai->oai_request(verb		=> 'ListIdentifiers',
 			       metadataPrefix	=> 'oai_dc');

where the returned $http variable is an object of type HTTP::Response.

NB: Each such call to the method oai_request should always be followed
by a data set incomplete check. The whole cycle can be expressed with
the following idiom:

  ## Get a chunk of OAI records
  my $http = $oai->oai_request(verb		=> 'ListIdentifiers',
 			       metadataPrefix	=> 'oai_dc');

  while ($http->is_success) {
      
      ## Terminate the loop if we're done
      last unless $oai->incomplete;

      ## Get the next chunk of data and reuse first OAI request
      $http = $oai->oai_request;
  }

This piece of code handles both the situation where the OAI server
returns all the records in one big chunk, and the flow-controlled
version, where the total amount of data is separated into
multiple chunks transmitted over multiple HTTP sessions.


=head1 AUTHOR

Anders Sønderberg Mortensen <sondberg@indexdata.dk>
Indexdata
2004/01/06

=head1 SEE-ALSO

See manual pages for the TKL::* packages, not to forget HTTP::* and
LWP::UserAgent.

=cut
