aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/share/perl/lib/XML/TreePP.pm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--share/perl/lib/XML/TreePP.pm1228
1 files changed, 1228 insertions, 0 deletions
diff --git a/share/perl/lib/XML/TreePP.pm b/share/perl/lib/XML/TreePP.pm
new file mode 100644
index 0000000..bd03db7
--- /dev/null
+++ b/share/perl/lib/XML/TreePP.pm
@@ -0,0 +1,1228 @@
1=head1 NAME
2
3XML::TreePP -- Pure Perl implementation for parsing/writing xml files
4
5=head1 SYNOPSIS
6
7parse xml file into hash tree
8
9 use XML::TreePP;
10 my $tpp = XML::TreePP->new();
11 my $tree = $tpp->parsefile( "index.rdf" );
12 print "Title: ", $tree->{"rdf:RDF"}->{item}->[0]->{title}, "\n";
13 print "URL: ", $tree->{"rdf:RDF"}->{item}->[0]->{link}, "\n";
14
15write xml as string from hash tree
16
17 use XML::TreePP;
18 my $tpp = XML::TreePP->new();
19 my $tree = { rss => { channel => { item => [ {
20 title => "The Perl Directory",
21 link => "http://www.perl.org/",
22 }, {
23 title => "The Comprehensive Perl Archive Network",
24 link => "http://cpan.perl.org/",
25 } ] } } };
26 my $xml = $tpp->write( $tree );
27 print $xml;
28
29get remote xml file with HTTP-GET and parse it into hash tree
30
31 use XML::TreePP;
32 my $tpp = XML::TreePP->new();
33 my $tree = $tpp->parsehttp( GET => "http://use.perl.org/index.rss" );
34 print "Title: ", $tree->{"rdf:RDF"}->{channel}->{title}, "\n";
35 print "URL: ", $tree->{"rdf:RDF"}->{channel}->{link}, "\n";
36
37get remote xml file with HTTP-POST and parse it into hash tree
38
39 use XML::TreePP;
40 my $tpp = XML::TreePP->new( force_array => [qw( item )] );
41 my $cgiurl = "http://search.hatena.ne.jp/keyword";
42 my $keyword = "ajax";
43 my $cgiquery = "mode=rss2&word=".$keyword;
44 my $tree = $tpp->parsehttp( POST => $cgiurl, $cgiquery );
45 print "Link: ", $tree->{rss}->{channel}->{item}->[0]->{link}, "\n";
46 print "Desc: ", $tree->{rss}->{channel}->{item}->[0]->{description}, "\n";
47
48=head1 DESCRIPTION
49
50XML::TreePP module parses XML file and expands it for a hash tree.
51And also generate XML file from a hash tree.
52This is a pure Perl implementation.
53You can also download XML from remote web server
54like XMLHttpRequest object at JavaScript language.
55
56=head1 EXAMPLES
57
58=head2 Parse XML file
59
60Sample XML source:
61
62 <?xml version="1.0" encoding="UTF-8"?>
63 <family name="Kawasaki">
64 <father>Yasuhisa</father>
65 <mother>Chizuko</mother>
66 <children>
67 <girl>Shiori</girl>
68 <boy>Yusuke</boy>
69 <boy>Kairi</boy>
70 </children>
71 </family>
72
73Sample program to read a xml file and dump it:
74
75 use XML::TreePP;
76 use Data::Dumper;
77 my $tpp = XML::TreePP->new();
78 my $tree = $tpp->parsefile( "family.xml" );
79 my $text = Dumper( $tree );
80 print $text;
81
82Result dumped:
83
84 $VAR1 = {
85 'family' => {
86 '-name' => 'Kawasaki',
87 'father' => 'Yasuhisa',
88 'mother' => 'Chizuko',
89 'children' => {
90 'girl' => 'Shiori'
91 'boy' => [
92 'Yusuke',
93 'Kairi'
94 ],
95 }
96 }
97 };
98
99Details:
100
101 print $tree->{family}->{father}; # the father's given name.
102
103The prefix '-' is added on every attribute's name.
104
105 print $tree->{family}->{"-name"}; # the family name of the family
106
107The array is used because the family has two boys.
108
109 print $tree->{family}->{children}->{boy}->[1]; # The second boy's name
110 print $tree->{family}->{children}->{girl}; # The girl's name
111
112=head2 Text node and attributes:
113
114If a element has both of a text node and attributes
115or both of a text node and other child nodes,
116value of a text node is moved to C<#text> like child nodes.
117
118 use XML::TreePP;
119 use Data::Dumper;
120 my $tpp = XML::TreePP->new();
121 my $source = '<span class="author">Kawasaki Yusuke</span>';
122 my $tree = $tpp->parse( $source );
123 my $text = Dumper( $tree );
124 print $text;
125
126The result dumped is following:
127
128 $VAR1 = {
129 'span' => {
130 '-class' => 'author',
131 '#text' => 'Kawasaki Yusuke'
132 }
133 };
134
135The special node name of C<#text> is used because this elements
136has attribute(s) in addition to the text node.
137See also L</text_node_key> option.
138
139=head1 METHODS
140
141=head2 new
142
143This constructor method returns a new XML::TreePP object with C<%options>.
144
145 $tpp = XML::TreePP->new( %options );
146
147=head2 set
148
149This method sets a option value for C<option_name>.
150If C<$option_value> is not defined, its option is deleted.
151
152 $tpp->set( option_name => $option_value );
153
154See OPTIONS section below for details.
155
156=head2 get
157
158This method returns a current option value for C<option_name>.
159
160 $tpp->get( 'option_name' );
161
162=head2 parse
163
164This method reads XML source and returns a hash tree converted.
165The first argument is a scalar or a reference to a scalar.
166
167 $tree = $tpp->parse( $source );
168
169=head2 parsefile
170
171This method reads a XML file and returns a hash tree converted.
172The first argument is a filename.
173
174 $tree = $tpp->parsefile( $file );
175
176=head2 parsehttp
177
178This method receives a XML file from a remote server via HTTP and
179returns a hash tree converted.
180
181 $tree = $tpp->parsehttp( $method, $url, $body, $head );
182
183C<$method> is a method of HTTP connection: GET/POST/PUT/DELETE
184C<$url> is an URI of a XML file.
185C<$body> is a request body when you use POST method.
186C<$head> is a request headers as a hash ref.
187L<LWP::UserAgent> module or L<HTTP::Lite> module is required to fetch a file.
188
189 ( $tree, $xml, $code ) = $tpp->parsehttp( $method, $url, $body, $head );
190
191In array context, This method returns also raw XML source received
192and HTTP response's status code.
193
194=head2 write
195
196This method parses a hash tree and returns a XML source generated.
197
198 $source = $tpp->write( $tree, $encode );
199
200C<$tree> is a reference to a hash tree.
201
202=head2 writefile
203
204This method parses a hash tree and writes a XML source into a file.
205
206 $tpp->writefile( $file, $tree, $encode );
207
208C<$file> is a filename to create.
209C<$tree> is a reference to a hash tree.
210
211=head1 OPTIONS FOR PARSING XML
212
213This module accepts option parameters following:
214
215=head2 force_array
216
217This option allows you to specify a list of element names which
218should always be forced into an array representation.
219
220 $tpp->set( force_array => [ 'rdf:li', 'item', '-xmlns' ] );
221
222The default value is null, it means that context of the elements
223will determine to make array or to keep it scalar or hash.
224Note that the special wildcard name C<'*'> means all elements.
225
226=head2 force_hash
227
228This option allows you to specify a list of element names which
229should always be forced into an hash representation.
230
231 $tpp->set( force_hash => [ 'item', 'image' ] );
232
233The default value is null, it means that context of the elements
234will determine to make hash or to keep it scalar as a text node.
235See also L</text_node_key> option below.
236Note that the special wildcard name C<'*'> means all elements.
237
238=head2 cdata_scalar_ref
239
240This option allows you to convert a cdata section into a reference
241for scalar on parsing XML source.
242
243 $tpp->set( cdata_scalar_ref => 1 );
244
245The default value is false, it means that each cdata section is converted into a scalar.
246
247=head2 user_agent
248
249This option allows you to specify a HTTP_USER_AGENT string which
250is used by parsehttp() method.
251
252 $tpp->set( user_agent => 'Mozilla/4.0 (compatible; ...)' );
253
254The default string is C<'XML-TreePP/#.##'>, where C<'#.##'> is
255substituted with the version number of this library.
256
257=head2 http_lite
258
259This option forces pasrsehttp() method to use a L<HTTP::Lite> instance.
260
261 my $http = HTTP::Lite->new();
262 $tpp->set( http_lite => $http );
263
264=head2 lwp_useragent
265
266This option forces pasrsehttp() method to use a L<LWP::UserAgent> instance.
267
268 my $ua = LWP::UserAgent->new();
269 $ua->timeout( 60 );
270 $ua->env_proxy;
271 $tpp->set( lwp_useragent => $ua );
272
273You may use this with L<LWP::UserAgent::WithCache>.
274
275=head2 base_class
276
277This blesses class name for each element's hashref.
278Each class is named straight as a child class of it parent class.
279
280 $tpp->set( base_class => 'MyElement' );
281 my $xml = '<root><parent><child key="val">text</child></parent></root>';
282 my $tree = $tpp->parse( $xml );
283 print ref $tree->{root}->{parent}->{child}, "\n";
284
285A hash for <child> element above is blessed to C<MyElement::root::parent::child>
286class. You may use this with L<Class::Accessor>.
287
288=head2 elem_class
289
290This blesses class name for each element's hashref.
291Each class is named horizontally under the direct child of C<MyElement>.
292
293 $tpp->set( base_class => 'MyElement' );
294 my $xml = '<root><parent><child key="val">text</child></parent></root>';
295 my $tree = $tpp->parse( $xml );
296 print ref $tree->{root}->{parent}->{child}, "\n";
297
298A hash for <child> element above is blessed to C<MyElement::child> class.
299
300=head1 OPTIONS FOR WRITING XML
301
302=head2 first_out
303
304This option allows you to specify a list of element/attribute
305names which should always appears at first on output XML code.
306
307 $tpp->set( first_out => [ 'link', 'title', '-type' ] );
308
309The default value is null, it means alphabetical order is used.
310
311=head2 last_out
312
313This option allows you to specify a list of element/attribute
314names which should always appears at last on output XML code.
315
316 $tpp->set( last_out => [ 'items', 'item', 'entry' ] );
317
318=head2 indent
319
320This makes the output more human readable by indenting appropriately.
321
322 $tpp->set( indent => 2 );
323
324This doesn't strictly follow the XML Document Spec but does looks nice.
325
326=head2 xml_decl
327
328This module generates an XML declaration on writing an XML code per default.
329This option forces to change or leave it.
330
331 $tpp->set( xml_decl => '' );
332
333=head2 output_encoding
334
335This option allows you to specify a encoding of xml file generated
336by write/writefile methods.
337
338 $tpp->set( output_encoding => 'UTF-8' );
339
340On Perl 5.8.0 and later, you can select it from every
341encodings supported by Encode.pm. On Perl 5.6.x and before with
342Jcode.pm, you can use C<Shift_JIS>, C<EUC-JP>, C<ISO-2022-JP> and
343C<UTF-8>. The default value is C<UTF-8> which is recommended encoding.
344
345=head1 OPTIONS FOR BOTH
346
347=head2 utf8_flag
348
349This makes utf8 flag on for every element's value parsed
350and makes it on for an XML code generated as well.
351
352 $tpp->set( utf8_flag => 1 );
353
354Perl 5.8.1 or later is required to use this.
355
356=head2 attr_prefix
357
358This option allows you to specify a prefix character(s) which
359is inserted before each attribute names.
360
361 $tpp->set( attr_prefix => '@' );
362
363The default character is C<'-'>.
364Or set C<'@'> to access attribute values like E4X, ECMAScript for XML.
365Zero-length prefix C<''> is available as well, it means no prefix is added.
366
367=head2 text_node_key
368
369This option allows you to specify a hash key for text nodes.
370
371 $tpp->set( text_node_key => '#text' );
372
373The default key is C<#text>.
374
375=head2 ignore_error
376
377This module calls Carp::croak function on an error per default.
378This option makes all errors ignored and just return.
379
380 $tpp->set( ignore_error => 1 );
381
382=head2 use_ixhash
383
384This option keeps the order for each element appeared in XML.
385L<Tie::IxHash> module is required.
386
387 $tpp->set( use_ixhash => 1 );
388
389This makes parsing performance slow.
390(about 100% slower than default)
391
392=head1 AUTHOR
393
394Yusuke Kawasaki, http://www.kawa.net/
395
396=head1 COPYRIGHT AND LICENSE
397
398Copyright (c) 2006-2007 Yusuke Kawasaki. All rights reserved.
399This program is free software; you can redistribute it and/or
400modify it under the same terms as Perl itself.
401
402=cut
403
404package XML::TreePP;
405use strict;
406use Carp;
407use Symbol;
408
409use vars qw( $VERSION );
410$VERSION = '0.32';
411
412my $XML_ENCODING = 'UTF-8';
413my $INTERNAL_ENCODING = 'UTF-8';
414my $USER_AGENT = 'XML-TreePP/'.$VERSION.' ';
415my $ATTR_PREFIX = '-';
416my $TEXT_NODE_KEY = '#text';
417
418sub new {
419 my $package = shift;
420 my $self = {@_};
421 bless $self, $package;
422 $self;
423}
424
425sub die {
426 my $self = shift;
427 my $mess = shift;
428 return if $self->{ignore_error};
429 Carp::croak $mess;
430}
431
432sub warn {
433 my $self = shift;
434 my $mess = shift;
435 return if $self->{ignore_error};
436 Carp::carp $mess;
437}
438
439sub set {
440 my $self = shift;
441 my $key = shift;
442 my $val = shift;
443 if ( defined $val ) {
444 $self->{$key} = $val;
445 }
446 else {
447 delete $self->{$key};
448 }
449}
450
451sub get {
452 my $self = shift;
453 my $key = shift;
454 $self->{$key} if exists $self->{$key};
455}
456
457sub writefile {
458 my $self = shift;
459 my $file = shift;
460 my $tree = shift or return $self->die( 'Invalid tree' );
461 my $encode = shift;
462 return $self->die( 'Invalid filename' ) unless defined $file;
463 my $text = $self->write( $tree, $encode );
464 if ( $] >= 5.008001 && utf8::is_utf8( $text ) ) {
465 utf8::encode( $text );
466 }
467 $self->write_raw_xml( $file, $text );
468}
469
470sub write {
471 my $self = shift;
472 my $tree = shift or return $self->die( 'Invalid tree' );
473 my $from = $self->{internal_encoding} || $INTERNAL_ENCODING;
474 my $to = shift || $self->{output_encoding} || $XML_ENCODING;
475 my $decl = $self->{xml_decl};
476 $decl = '<?xml version="1.0" encoding="' . $to . '" ?>' unless defined $decl;
477
478 local $self->{__first_out};
479 if ( exists $self->{first_out} ) {
480 my $keys = $self->{first_out};
481 $keys = [$keys] unless ref $keys;
482 $self->{__first_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
483 }
484
485 local $self->{__last_out};
486 if ( exists $self->{last_out} ) {
487 my $keys = $self->{last_out};
488 $keys = [$keys] unless ref $keys;
489 $self->{__last_out} = { map { $keys->[$_] => $_ } 0 .. $#$keys };
490 }
491
492 my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
493 $tnk = $TEXT_NODE_KEY unless defined $tnk;
494 local $self->{text_node_key} = $tnk;
495
496 my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
497 $apre = $ATTR_PREFIX unless defined $apre;
498 local $self->{__attr_prefix_len} = length($apre);
499 local $self->{__attr_prefix_rex} = defined $apre ? qr/^\Q$apre\E/s : undef;
500
501 local $self->{__indent};
502 if ( exists $self->{indent} && $self->{indent} ) {
503 $self->{__indent} = ' ' x $self->{indent};
504 }
505
506 my $text = $self->hash_to_xml( undef, $tree );
507 if ( $from && $to ) {
508 my $stat = $self->encode_from_to( \$text, $from, $to );
509 return $self->die( "Unsupported encoding: $to" ) unless $stat;
510 }
511
512 return $text if ( $decl eq '' );
513 join( "\n", $decl, $text );
514}
515
516sub parsehttp {
517 my $self = shift;
518
519 local $self->{__user_agent};
520 if ( exists $self->{user_agent} ) {
521 my $agent = $self->{user_agent};
522 $agent .= $USER_AGENT if ( $agent =~ /\s$/s );
523 $self->{__user_agent} = $agent if ( $agent ne '' );
524 } else {
525 $self->{__user_agent} = $USER_AGENT;
526 }
527
528 my $http = $self->{__http_module};
529 unless ( $http ) {
530 $http = $self->find_http_module(@_);
531 $self->{__http_module} = $http;
532 }
533 if ( $http eq 'LWP::UserAgent' ) {
534 return $self->parsehttp_lwp(@_);
535 }
536 elsif ( $http eq 'HTTP::Lite' ) {
537 return $self->parsehttp_lite(@_);
538 }
539 else {
540 return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
541 }
542}
543
544sub find_http_module {
545 my $self = shift || {};
546
547 if ( exists $self->{lwp_useragent} && ref $self->{lwp_useragent} ) {
548 return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
549 return 'LWP::UserAgent' if &load_lwp_useragent();
550 return $self->die( "LWP::UserAgent is required: $_[1]" );
551 }
552
553 if ( exists $self->{http_lite} && ref $self->{http_lite} ) {
554 return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
555 return 'HTTP::Lite' if &load_http_lite();
556 return $self->die( "HTTP::Lite is required: $_[1]" );
557 }
558
559 return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION;
560 return 'HTTP::Lite' if defined $HTTP::Lite::VERSION;
561 return 'LWP::UserAgent' if &load_lwp_useragent();
562 return 'HTTP::Lite' if &load_http_lite();
563 return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" );
564}
565
566sub load_lwp_useragent {
567 return $LWP::UserAgent::VERSION if defined $LWP::UserAgent::VERSION;
568 local $@;
569 eval { require LWP::UserAgent; };
570 $LWP::UserAgent::VERSION;
571}
572
573sub load_http_lite {
574 return $HTTP::Lite::VERSION if defined $HTTP::Lite::VERSION;
575 local $@;
576 eval { require HTTP::Lite; };
577 $HTTP::Lite::VERSION;
578}
579
580sub load_tie_ixhash {
581 return $Tie::IxHash::VERSION if defined $Tie::IxHash::VERSION;
582 local $@;
583 eval { require Tie::IxHash; };
584 $Tie::IxHash::VERSION;
585}
586
587sub parsehttp_lwp {
588 my $self = shift;
589 my $method = shift or return $self->die( 'Invalid HTTP method' );
590 my $url = shift or return $self->die( 'Invalid URL' );
591 my $body = shift;
592 my $header = shift;
593
594 my $ua = $self->{lwp_useragent} if exists $self->{lwp_useragent};
595 if ( ! ref $ua ) {
596 $ua = LWP::UserAgent->new();
597 $ua->timeout(10);
598 $ua->env_proxy();
599 $ua->agent( $self->{__user_agent} ) if defined $self->{__user_agent};
600 } else {
601 $ua->agent( $self->{__user_agent} ) if exists $self->{user_agent};
602 }
603
604 my $req = HTTP::Request->new( $method, $url );
605 my $ct = 0;
606 if ( ref $header ) {
607 foreach my $field ( sort keys %$header ) {
608 my $value = $header->{$field};
609 $req->header( $field => $value );
610 $ct ++ if ( $field =~ /^Content-Type$/i );
611 }
612 }
613 if ( defined $body && ! $ct ) {
614 $req->header( 'Content-Type' => 'application/x-www-form-urlencoded' );
615 }
616 $req->content($body) if defined $body;
617 my $res = $ua->request($req);
618 my $code = $res->code();
619 my $text = $res->content();
620 my $tree = $self->parse( \$text ) if $res->is_success();
621 wantarray ? ( $tree, $text, $code ) : $tree;
622}
623
624sub parsehttp_lite {
625 my $self = shift;
626 my $method = shift or return $self->die( 'Invalid HTTP method' );
627 my $url = shift or return $self->die( 'Invalid URL' );
628 my $body = shift;
629 my $header = shift;
630
631 my $http = HTTP::Lite->new();
632 $http->method($method);
633 my $ua = 0;
634 if ( ref $header ) {
635 foreach my $field ( sort keys %$header ) {
636 my $value = $header->{$field};
637 $http->add_req_header( $field, $value );
638 $ua ++ if ( $field =~ /^User-Agent$/i );
639 }
640 }
641 if ( defined $self->{__user_agent} && ! $ua ) {
642 $http->add_req_header( 'User-Agent', $self->{__user_agent} );
643 }
644 $http->{content} = $body if defined $body;
645 my $code = $http->request($url) or return;
646 my $text = $http->body();
647 my $tree = $self->parse( \$text );
648 wantarray ? ( $tree, $text, $code ) : $tree;
649}
650
651sub parsefile {
652 my $self = shift;
653 my $file = shift;
654 return $self->die( 'Invalid filename' ) unless defined $file;
655 my $text = $self->read_raw_xml($file);
656 $self->parse( \$text );
657}
658
659sub parse {
660 my $self = shift;
661 my $text = ref $_[0] ? ${$_[0]} : $_[0];
662 return $self->die( 'Null XML source' ) unless defined $text;
663
664 my $from = &xml_decl_encoding(\$text) || $XML_ENCODING;
665 my $to = $self->{internal_encoding} || $INTERNAL_ENCODING;
666 if ( $from && $to ) {
667 my $stat = $self->encode_from_to( \$text, $from, $to );
668 return $self->die( "Unsupported encoding: $from" ) unless $stat;
669 }
670
671 local $self->{__force_array};
672 local $self->{__force_array_all};
673 if ( exists $self->{force_array} ) {
674 my $force = $self->{force_array};
675 $force = [$force] unless ref $force;
676 $self->{__force_array} = { map { $_ => 1 } @$force };
677 $self->{__force_array_all} = $self->{__force_array}->{'*'};
678 }
679
680 local $self->{__force_hash};
681 local $self->{__force_hash_all};
682 if ( exists $self->{force_hash} ) {
683 my $force = $self->{force_hash};
684 $force = [$force] unless ref $force;
685 $self->{__force_hash} = { map { $_ => 1 } @$force };
686 $self->{__force_hash_all} = $self->{__force_hash}->{'*'};
687 }
688
689 my $tnk = $self->{text_node_key} if exists $self->{text_node_key};
690 $tnk = $TEXT_NODE_KEY unless defined $tnk;
691 local $self->{text_node_key} = $tnk;
692
693 my $apre = $self->{attr_prefix} if exists $self->{attr_prefix};
694 $apre = $ATTR_PREFIX unless defined $apre;
695 local $self->{attr_prefix} = $apre;
696
697 if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
698 return $self->die( "Tie::IxHash is required." ) unless &load_tie_ixhash();
699 }
700
701 my $flat = $self->xml_to_flat(\$text);
702 my $class = $self->{base_class} if exists $self->{base_class};
703 my $tree = $self->flat_to_tree( $flat, '', $class );
704 if ( ref $tree ) {
705 if ( defined $class ) {
706 bless( $tree, $class );
707 }
708 elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
709 bless( $tree, $self->{elem_class} );
710 }
711 }
712 wantarray ? ( $tree, $text ) : $tree;
713}
714
715sub xml_to_flat {
716 my $self = shift;
717 my $textref = shift; # reference
718 my $flat = [];
719 my $prefix = $self->{attr_prefix};
720 my $ixhash = ( exists $self->{use_ixhash} && $self->{use_ixhash} );
721
722 while ( $$textref =~ m{
723 ([^<]*) <
724 ((
725 \? ([^<>]*) \?
726 )|(
727 \!\[CDATA\[(.*?)\]\]
728 )|(
729 \!DOCTYPE\s+([^\[\]<>]*(?:\[.*?\]\s*)?)
730 )|(
731 \!--(.*?)--
732 )|(
733 ([^\!\?\s<>](?:"[^"]*"|'[^']*'|[^"'<>])*)
734 ))
735 > ([^<]*)
736 }sxg ) {
737 my (
738 $ahead, $match, $typePI, $contPI, $typeCDATA,
739 $contCDATA, $typeDocT, $contDocT, $typeCmnt, $contCmnt,
740 $typeElem, $contElem, $follow
741 )
742 = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 );
743 if ( defined $ahead && $ahead =~ /\S/ ) {
744 $self->warn( "Invalid string: [$ahead] before <$match>" );
745 }
746
747 if ($typeElem) { # Element
748 my $node = {};
749 if ( $contElem =~ s#^/## ) {
750 $node->{endTag}++;
751 }
752 elsif ( $contElem =~ s#/$## ) {
753 $node->{emptyTag}++;
754 }
755 else {
756 $node->{startTag}++;
757 }
758 $node->{tagName} = $1 if ( $contElem =~ s#^(\S+)\s*## );
759 unless ( $node->{endTag} ) {
760 my $attr;
761 while ( $contElem =~ m{
762 ([^\s\=\"\']+)=(?:(")(.*?)"|'(.*?)')
763 }sxg ) {
764 my $key = $1;
765 my $val = &xml_unescape( $2 ? $3 : $4 );
766 if ( ! ref $attr ) {
767 $attr = {};
768 tie( %$attr, 'Tie::IxHash' ) if $ixhash;
769 }
770 $attr->{$prefix.$key} = $val;
771 }
772 $node->{attributes} = $attr if ref $attr;
773 }
774 push( @$flat, $node );
775 }
776 elsif ($typeCDATA) { ## CDATASection
777 if ( exists $self->{cdata_scalar_ref} && $self->{cdata_scalar_ref} ) {
778 push( @$flat, \$contCDATA ); # as reference for scalar
779 }
780 else {
781 push( @$flat, $contCDATA ); # as scalar like text node
782 }
783 }
784 elsif ($typeCmnt) { # Comment (ignore)
785 }
786 elsif ($typeDocT) { # DocumentType (ignore)
787 }
788 elsif ($typePI) { # ProcessingInstruction (ignore)
789 }
790 else {
791 $self->warn( "Invalid Tag: <$match>" );
792 }
793 if ( $follow =~ /\S/ ) { # text node
794 my $val = &xml_unescape($follow);
795 push( @$flat, $val );
796 }
797 }
798 $flat;
799}
800
801sub flat_to_tree {
802 my $self = shift;
803 my $source = shift;
804 my $parent = shift;
805 my $class = shift;
806 my $tree = {};
807 my $text = [];
808
809 if ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
810 tie( %$tree, 'Tie::IxHash' );
811 }
812
813 while ( scalar @$source ) {
814 my $node = shift @$source;
815 if ( !ref $node || UNIVERSAL::isa( $node, "SCALAR" ) ) {
816 push( @$text, $node ); # cdata or text node
817 next;
818 }
819 my $name = $node->{tagName};
820 if ( $node->{endTag} ) {
821 last if ( $parent eq $name );
822 return $self->die( "Invalid tag sequence: <$parent></$name>" );
823 }
824 my $elem = $node->{attributes};
825 my $forcehash = $self->{__force_hash_all} || $self->{__force_hash}->{$name};
826 my $subclass;
827 if ( defined $class ) {
828 my $escname = $name;
829 $escname =~ s/\W/_/sg;
830 $subclass = $class.'::'.$escname;
831 }
832 if ( $node->{startTag} ) { # recursive call
833 my $child = $self->flat_to_tree( $source, $name, $subclass );
834 next unless defined $child;
835 my $hasattr = scalar keys %$elem if ref $elem;
836 if ( UNIVERSAL::isa( $child, "HASH" ) ) {
837 if ( $hasattr ) {
838 # some attributes and some child nodes
839 %$elem = ( %$elem, %$child );
840 }
841 else {
842 # some child nodes without attributes
843 $elem = $child;
844 }
845 }
846 else {
847 if ( $hasattr ) {
848 # some attributes and text node
849 $elem->{$self->{text_node_key}} = $child;
850 }
851 elsif ( $forcehash ) {
852 # only text node without attributes
853 $elem = { $self->{text_node_key} => $child };
854 }
855 else {
856 # text node without attributes
857 $elem = $child;
858 }
859 }
860 }
861 elsif ( $forcehash && ! ref $elem ) {
862 $elem = {};
863 }
864 # bless to a class by base_class or elem_class
865 if ( ref $elem && UNIVERSAL::isa( $elem, "HASH" ) ) {
866 if ( defined $subclass ) {
867 bless( $elem, $subclass );
868 } elsif ( exists $self->{elem_class} && $self->{elem_class} ) {
869 my $escname = $name;
870 $escname =~ s/\W/_/sg;
871 my $elmclass = $self->{elem_class}.'::'.$escname;
872 bless( $elem, $elmclass );
873 }
874 }
875 # next unless defined $elem;
876 $tree->{$name} ||= [];
877 push( @{ $tree->{$name} }, $elem );
878 }
879 if ( ! $self->{__force_array_all} ) {
880 foreach my $key ( keys %$tree ) {
881 next if $self->{__force_array}->{$key};
882 next if ( 1 < scalar @{ $tree->{$key} } );
883 $tree->{$key} = shift @{ $tree->{$key} };
884 }
885 }
886 my $haschild = scalar keys %$tree;
887 if ( scalar @$text ) {
888 if ( scalar @$text == 1 ) {
889 # one text node (normal)
890 $text = shift @$text;
891 }
892 elsif ( ! scalar grep {ref $_} @$text ) {
893 # some text node splitted
894 $text = join( '', @$text );
895 }
896 else {
897 # some cdata node
898 my $join = join( '', map {ref $_ ? $$_ : $_} @$text );
899 $text = \$join;
900 }
901 if ( $haschild ) {
902 # some child nodes and also text node
903 $tree->{$self->{text_node_key}} = $text;
904 }
905 else {
906 # only text node without child nodes
907 $tree = $text;
908 }
909 }
910 elsif ( ! $haschild ) {
911 # no child and no text
912 $tree = "";
913 }
914 $tree;
915}
916
917sub hash_to_xml {
918 my $self = shift;
919 my $name = shift;
920 my $hash = shift;
921 my $out = [];
922 my $attr = [];
923 my $allkeys = [ keys %$hash ];
924 my $fo = $self->{__first_out} if ref $self->{__first_out};
925 my $lo = $self->{__last_out} if ref $self->{__last_out};
926 my $firstkeys = [ sort { $fo->{$a} <=> $fo->{$b} } grep { exists $fo->{$_} } @$allkeys ] if ref $fo;
927 my $lastkeys = [ sort { $lo->{$a} <=> $lo->{$b} } grep { exists $lo->{$_} } @$allkeys ] if ref $lo;
928 $allkeys = [ grep { ! exists $fo->{$_} } @$allkeys ] if ref $fo;
929 $allkeys = [ grep { ! exists $lo->{$_} } @$allkeys ] if ref $lo;
930 unless ( exists $self->{use_ixhash} && $self->{use_ixhash} ) {
931 $allkeys = [ sort @$allkeys ];
932 }
933 my $prelen = $self->{__attr_prefix_len};
934 my $pregex = $self->{__attr_prefix_rex};
935
936 foreach my $keys ( $firstkeys, $allkeys, $lastkeys ) {
937 next unless ref $keys;
938 my $elemkey = $prelen ? [ grep { $_ !~ $pregex } @$keys ] : $keys;
939 my $attrkey = $prelen ? [ grep { $_ =~ $pregex } @$keys ] : [];
940
941 foreach my $key ( @$elemkey ) {
942 my $val = $hash->{$key};
943 if ( !defined $val ) {
944 push( @$out, "<$key />" );
945 }
946 elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
947 my $child = $self->array_to_xml( $key, $val );
948 push( @$out, $child );
949 }
950 elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
951 my $child = $self->scalaref_to_cdata( $key, $val );
952 push( @$out, $child );
953 }
954 elsif ( ref $val ) {
955 my $child = $self->hash_to_xml( $key, $val );
956 push( @$out, $child );
957 }
958 else {
959 my $child = $self->scalar_to_xml( $key, $val );
960 push( @$out, $child );
961 }
962 }
963
964 foreach my $key ( @$attrkey ) {
965 my $name = substr( $key, $prelen );
966 my $val = &xml_escape( $hash->{$key} );
967 push( @$attr, ' ' . $name . '="' . $val . '"' );
968 }
969 }
970 my $jattr = join( '', @$attr );
971
972 if ( defined $name && scalar @$out && ! grep { ! /^</s } @$out ) {
973 # Use human-friendly white spacing
974 if ( defined $self->{__indent} ) {
975 s/^(\s*<)/$self->{__indent}$1/mg foreach @$out;
976 }
977 unshift( @$out, "\n" );
978 }
979
980 my $text = join( '', @$out );
981 if ( defined $name ) {
982 if ( scalar @$out ) {
983 $text = "<$name$jattr>$text</$name>\n";
984 }
985 else {
986 $text = "<$name$jattr />\n";
987 }
988 }
989 $text;
990}
991
992sub array_to_xml {
993 my $self = shift;
994 my $name = shift;
995 my $array = shift;
996 my $out = [];
997 foreach my $val (@$array) {
998 if ( !defined $val ) {
999 push( @$out, "<$name />\n" );
1000 }
1001 elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) {
1002 my $child = $self->array_to_xml( $name, $val );
1003 push( @$out, $child );
1004 }
1005 elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) {
1006 my $child = $self->scalaref_to_cdata( $name, $val );
1007 push( @$out, $child );
1008 }
1009 elsif ( ref $val ) {
1010 my $child = $self->hash_to_xml( $name, $val );
1011 push( @$out, $child );
1012 }
1013 else {
1014 my $child = $self->scalar_to_xml( $name, $val );
1015 push( @$out, $child );
1016 }
1017 }
1018
1019 my $text = join( '', @$out );
1020 $text;
1021}
1022
1023sub scalaref_to_cdata {
1024 my $self = shift;
1025 my $name = shift;
1026 my $ref = shift;
1027 my $data = defined $$ref ? $$ref : '';
1028 $data =~ s#(]])(>)#$1]]><![CDATA[$2#g;
1029 #my $text = '<![CDATA[' . $data . ']]>';
1030 my $text = $data;
1031 $text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
1032 $text;
1033}
1034
1035sub scalar_to_xml {
1036 my $self = shift;
1037 my $name = shift;
1038 my $scalar = shift;
1039 my $copy = $scalar;
1040 my $text = &xml_escape($copy);
1041 $text = "<$name>$text</$name>\n" if ( $name ne $self->{text_node_key} );
1042 $text;
1043}
1044
1045sub write_raw_xml {
1046 my $self = shift;
1047 my $file = shift;
1048 my $fh = Symbol::gensym();
1049 open( $fh, ">$file" ) or return $self->die( "$! - $file" );
1050 print $fh @_;
1051 close($fh);
1052}
1053
1054sub read_raw_xml {
1055 my $self = shift;
1056 my $file = shift;
1057 my $fh = Symbol::gensym();
1058 open( $fh, $file ) or return $self->die( "$! - $file" );
1059 local $/ = undef;
1060 my $text = <$fh>;
1061 close($fh);
1062 $text;
1063}
1064
1065sub xml_decl_encoding {
1066 my $textref = shift;
1067 return unless defined $$textref;
1068 my $args = ( $$textref =~ /^\s*<\?xml(\s+\S.*)\?>/s )[0] or return;
1069 my $getcode = ( $args =~ /\s+encoding=(".*?"|'.*?')/ )[0] or return;
1070 $getcode =~ s/^['"]//;
1071 $getcode =~ s/['"]$//;
1072 $getcode;
1073}
1074
1075sub encode_from_to {
1076 my $self = shift;
1077 my $txtref = shift or return;
1078 my $from = shift or return;
1079 my $to = shift or return;
1080
1081 unless ( defined $Encode::EUCJPMS::VERSION ) {
1082 $from = 'EUC-JP' if ( $from =~ /\beuc-?jp-?(win|ms)$/i );
1083 $to = 'EUC-JP' if ( $to =~ /\beuc-?jp-?(win|ms)$/i );
1084 }
1085
1086 my $setflag = $self->{utf8_flag} if exists $self->{utf8_flag};
1087 if ( $] < 5.008001 && $setflag ) {
1088 return $self->die( "Perl 5.8.1 is required for utf8_flag: $]" );
1089 }
1090
1091 if ( $] >= 5.008 ) {
1092 &load_encode();
1093 my $check = ( $Encode::VERSION < 2.13 ) ? 0x400 : Encode::FB_XMLCREF();
1094 if ( $] >= 5.008001 && utf8::is_utf8( $$txtref ) ) {
1095 if ( $to =~ /^utf-?8$/i ) {
1096 # skip
1097 } else {
1098 $$txtref = Encode::encode( $to, $$txtref, $check );
1099 }
1100 } else {
1101 $$txtref = Encode::decode( $from, $$txtref );
1102 if ( $to =~ /^utf-?8$/i && $setflag ) {
1103 # skip
1104 } else {
1105 $$txtref = Encode::encode( $to, $$txtref, $check );
1106 }
1107 }
1108 }
1109 elsif ( ( uc($from) eq 'ISO-8859-1'
1110 || uc($from) eq 'US-ASCII'
1111 || uc($from) eq 'LATIN-1' ) && uc($to) eq 'UTF-8' ) {
1112 &latin1_to_utf8($txtref);
1113 }
1114 else {
1115 my $jfrom = &get_jcode_name($from);
1116 my $jto = &get_jcode_name($to);
1117 return $to if ( uc($jfrom) eq uc($jto) );
1118 if ( $jfrom && $jto ) {
1119 &load_jcode();
1120 if ( defined $Jcode::VERSION ) {
1121 Jcode::convert( $txtref, $jto, $jfrom );
1122 }
1123 else {
1124 return $self->die( "Jcode.pm is required: $from to $to" );
1125 }
1126 }
1127 else {
1128 return $self->die( "Encode.pm is required: $from to $to" );
1129 }
1130 }
1131 $to;
1132}
1133
1134sub load_jcode {
1135 return if defined $Jcode::VERSION;
1136 local $@;
1137 eval { require Jcode; };
1138}
1139
1140sub load_encode {
1141 return if defined $Encode::VERSION;
1142 local $@;
1143 eval { require Encode; };
1144}
1145
1146sub latin1_to_utf8 {
1147 my $strref = shift;
1148 $$strref =~ s{
1149 ([\x80-\xFF])
1150 }{
1151 pack( 'C2' => 0xC0|(ord($1)>>6),0x80|(ord($1)&0x3F) )
1152 }exg;
1153}
1154
1155sub get_jcode_name {
1156 my $src = shift;
1157 my $dst;
1158 if ( $src =~ /^utf-?8$/i ) {
1159 $dst = 'utf8';
1160 }
1161 elsif ( $src =~ /^euc.*jp(-?(win|ms))?$/i ) {
1162 $dst = 'euc';
1163 }
1164 elsif ( $src =~ /^(shift.*jis|cp932|windows-31j)$/i ) {
1165 $dst = 'sjis';
1166 }
1167 elsif ( $src =~ /^iso-2022-jp/ ) {
1168 $dst = 'jis';
1169 }
1170 $dst;
1171}
1172
1173sub xml_escape {
1174 my $str = shift;
1175 return '' unless defined $str;
1176 # except for TAB(\x09),CR(\x0D),LF(\x0A)
1177 $str =~ s{
1178 ([\x00-\x08\x0B\x0C\x0E-\x1F\x7F])
1179 }{
1180 sprintf( '&#%d;', ord($1) );
1181 }gex;
1182 $str =~ s/&(?!#(\d+;|x[\dA-Fa-f]+;))/&amp;/g;
1183 $str =~ s/</&lt;/g;
1184 $str =~ s/>/&gt;/g;
1185 $str =~ s/'/&apos;/g;
1186 $str =~ s/"/&quot;/g;
1187 $str;
1188}
1189
1190sub xml_unescape {
1191 my $str = shift;
1192 my $map = {qw( quot " lt < gt > apos ' amp & )};
1193 $str =~ s{
1194 (&(?:\#(\d+)|\#x([0-9a-fA-F]+)|(quot|lt|gt|apos|amp));)
1195 }{
1196 $4 ? $map->{$4} : &char_deref($1,$2,$3);
1197 }gex;
1198 $str;
1199}
1200
1201sub char_deref {
1202 my( $str, $dec, $hex ) = @_;
1203 if ( defined $dec ) {
1204 return &code_to_utf8( $dec ) if ( $dec < 256 );
1205 }
1206 elsif ( defined $hex ) {
1207 my $num = hex($hex);
1208 return &code_to_utf8( $num ) if ( $num < 256 );
1209 }
1210 return $str;
1211}
1212
1213sub code_to_utf8 {
1214 my $code = shift;
1215 if ( $code < 128 ) {
1216 return pack( C => $code );
1217 }
1218 elsif ( $code < 256 ) {
1219 return pack( C2 => 0xC0|($code>>6), 0x80|($code&0x3F));
1220 }
1221 elsif ( $code < 65536 ) {
1222 return pack( C3 => 0xC0|($code>>12), 0x80|(($code>>6)&0x3F), 0x80|($code&0x3F));
1223 }
1224 return shift if scalar @_; # default value
1225 sprintf( '&#x%04X;', $code );
1226}
1227
12281;