Index: ossp-pkg/quos/openpkg-rdf2sql.pl RCS File: /v/ossp/cvs/ossp-pkg/quos/openpkg-rdf2sql.pl,v rcsdiff -q -kk '-r1.3' '-r1.4' -u '/v/ossp/cvs/ossp-pkg/quos/openpkg-rdf2sql.pl,v' 2>/dev/null --- openpkg-rdf2sql.pl 2004/11/08 15:32:59 1.3 +++ openpkg-rdf2sql.pl 2004/11/16 14:05:32 1.4 @@ -1,4 +1,4 @@ -#!/usr/opkg/bin/perl +#!/usr/lpkg/bin/perl ## ## OSSP quos - Query On Steroids ## Copyright (c) 2004 Ralf S. Engelschall @@ -26,28 +26,23 @@ ## require 5.008; +use strict; use Getopt::Long; use XML::Parser; use DBI; use DBD::SQLite; use Data::Dumper; use XML::Simple; +use LWP::UserAgent; +use IPC::Filter qw(); # configure optional debugging $Data::Dumper::Purity = 1; $Data::Dumper::Indent = 1; $Data::Dumper::Terse = 1; -my $xml = new XML::Simple; -my $rdf = $xml->XMLin( - "openpkg.rdf", - KeepRoot => 1, - ForceContent => 0, - ForceArray => 1, -); -undef $xml; -#print Dumper($xml); -#exit(0); +my $ftpserv = 'ftp://anonymous:herb@ftp.openpkg.org/'; +my $starturl = '00INDEX.rdf'; # connect to database my $db = DBI->connect("dbi:SQLite:dbname=openpkg.db", "", "", @@ -85,56 +80,121 @@ " VALUES ((SELECT MAX(pk_id) FROM quos_package),?);" ); -# iterate over XML/RDF data structure -foreach my $repo (@{$rdf->{'rdf:RDF'}->[0]->{'Repository'}}) { - my $rd_url = $repo->{'rdf:resource'}; +$db->begin_work(); - # store repository information - $db->begin_work(); - $sql->{-rdf}->execute($rd_url); - - # interate over all packages in a repository - foreach my $desc (@{$repo->{'rdf:Description'}}) { - # store simple (single-value) properties of a package - my $prop = {}; - foreach my $attr (qw( - Name Version Release Distribution Group License - Packager Summary URL Vendor Description - )) { - $prop->{$attr} = $desc->{$attr}->[0]; +&fetchrdfsrecursendump2db($starturl); + +sub fetchrdfsrecursendump2db { + + my ($url) = @_; + my $ua = new LWP::UserAgent; + $ua->agent("rdfcrawl/1.0 "); + my $req = new HTTP::Request(GET => $ftpserv . $url); + my $rescont = ''; + + my $res = $ua->request($req); + if ($res->is_success) { + $_ = $url; + if (m|.*bz2$|) { + $rescont = IPC::Filter::filter($res->content, "bzip2 -d"); + } + else { + $rescont = $res->content; } - $sql->{-package}->execute( - $prop->{'Name'}, $prop->{'Version'}, $prop->{'Release'}, $prop->{'Distribution'}, $prop->{'Group'}, - $prop->{'License'}, $prop->{'Packager'}, $prop->{'Summary'}, $prop->{'URL'}, $prop->{'Vendor'}, - $prop->{'Description'} - ); - - # store complex (multi-value) properties of a package - foreach my $attr (qw( - BuildPreReq - PreReq - Provide - )) { - foreach my $el (@{$desc->{$attr}->[0]->{'rdf:bag'}->[0]->{'resource'}}) { - my ($key, $op, $val) = ($el, '=', '*'); - if (ref($key) eq 'HASH') { - $key = $el->{'content'}; - $op = (grep { $_ ne 'content' } keys(%{$el}))[0]; - $val = $el->{$op}; + } + else { + print $res->status_line, "\n"; + } + + my $xml = new XML::Simple; + my $rdf = $xml->XMLin( + $rescont, + KeepRoot => 1, + ForceContent => 0, + ForceArray => 1, + ); + undef $xml; + +# iterate over XML/RDF data structure + foreach my $repo (@{$rdf->{'rdf:RDF'}->[0]->{'Repository'}}) { + my $rd_url = $repo->{'rdf:resource'}; + + # store repository information +# $db->begin_work(); + $sql->{-rdf}->execute($rd_url); + + # interate over all packages in a repository + if (defined($repo->{'rdf:Description'})) { + # from now on package descriptions + foreach my $desc (@{$repo->{'rdf:Description'}}) { + # store simple (single-value) properties of a package + my $prop = {}; + foreach my $attr (qw( + Name Version Release Distribution Group License + Packager Summary URL Vendor Description + )) { + $prop->{$attr} = $desc->{$attr}->[0]; + } + $sql->{-package}->execute( + $prop->{'Name'}, $prop->{'Version'}, $prop->{'Release'}, $prop->{'Distribution'}, $prop->{'Group'}, + $prop->{'License'}, $prop->{'Packager'}, $prop->{'Summary'}, $prop->{'URL'}, $prop->{'Vendor'}, + $prop->{'Description'} + ); + + # store complex (multi-value) properties of a package + foreach my $attr (qw( + BuildPreReq + PreReq + Provide + )) { + foreach my $el (@{$desc->{$attr}->[0]->{'rdf:bag'}->[0]->{'resource'}}) { + my ($key, $op, $val) = ($el, '=', '*'); + if (ref($key) eq 'HASH') { + $key = $el->{'content'}; + $op = (grep { $_ ne 'content' } keys(%{$el}))[0]; + $val = $el->{$op}; + } + $sql->{"-".lc($attr)}->execute($key, $op, $val); + } + } + foreach my $url (@{$desc->{'Source'}->[0]->{'rdf:bag'}->[0]->{'rdf:li'}}) { + $sql->{-source}->execute($url); } - $sql->{"-".lc($attr)}->execute($key, $op, $val); } } - foreach my $url (@{$desc->{'Source'}->[0]->{'rdf:bag'}->[0]->{'rdf:li'}}) { - $sql->{-source}->execute($url); + else { + # from now on rdf references + foreach my $repcont (@{$repo->{'Repository'}}) { + $url =~ m|^(.*/)|; + my $actpath = $1; + &fetchrdfsrecursendump2db($actpath . $repcont->{'href'}); + sleep(1); + } } +# $db->commit(); } - $db->commit(); } - -# disconnect from database +# commit and disconnect from database +$db->commit(); $db->disconnect(); +sub showactxml { +# temporary function for showing actual xml + + my ($xmlinput) = @_; + + my $xml = new XML::Simple; + + my $rdf = $xml->XMLin( + $xmlinput, + KeepRoot => 1, + ForceContent => 0, + ForceArray => 1, + ); +undef $xml; +print Dumper($rdf); +} + __END__ =pod