<?php // encoding="UTF-8"
/**

HtmlInc, stream methods to get HTML information (even through HTTP)

© 2010, École nationale des chartes, licence CeCILL-C (LGPL compatible droit français)

From a document point of view, an HTML file is a body plus metadata. Metadata is useful as an array in memory. The body can be obtained as a string or poured into a stream, which is useful for slow remote web services or for caching dynamic content. A fast way to extract the <body> is a user stream filter.

<?php
// example of usage
$doc=new HtmlInc("http://mySearchEngine.net/results?q=".$_REQUEST['q']);
$slow=true;
?>
<html>
  <head>
    <?php $doc->meta() ?>
  </head>
  <body>
    <div id="header">My header</div>
    <p>My corpus</p>
    <?php
    if($slow) $doc->body(fopen('php://output', 'w')); // write the body straight to the output stream
    else echo $doc->body(''); // get body as string
    ?>
    <div id="footer">My footer</div>
  </body>
</html>
*/
/**
 * Loads an HTML page (local file or HTTP resource, or a string supplied by
 * the caller) and exposes its metadata and its body separately.
 *
 * Everything is loaded lazily: the source is only read the first time a
 * method needs it, and the head is only parsed on the first call to head(),
 * props() or meta().
 */
class HtmlInc
{
    /** The requested URI (may be a local file path). */
    public $uri;
    /** The whole HTML source, once loaded (or as supplied to the constructor). */
    public $html;
    /** The cleaned title/meta/link tags as one string, ready to print in a head element. */
    public $meta;
    /** The metadata as an array, see head() for its structure. */
    public $props;
    /** The file name of the resource, without extension. */
    public $name;
    /** A short label for the resource (defaults to the file name). */
    public $label;
    /** A longer title. Not set by this class. */
    public $title;
    /** True when the URI is remote (http:// or https://). */
    public $http;
    /** HTML error message when the resource cannot be used, null otherwise. */
    public $error;

    /**
     * Constructor. Records the URI but does not load anything yet.
     *
     * A constructor cannot return a value, so problems are reported through
     * the public $error property.
     *
     * @param string|null  $uri  Local path or http(s) URL. May be empty, in which
     *                           case the object is only useful with $html or
     *                           through the static helpers.
     * @param string|false $html Optional HTML source to use instead of reading $uri.
     */
    public function __construct($uri, $html=false)
    {
        if ($html) {
            $this->html = $html;
        }
        // No URI: let the caller instantiate the object and use it as needed.
        if (!$uri) {
            return;
        }
        $this->uri = $uri;
        $this->name = $this->label = current(explode('.', basename($uri)));
        // Remote resources are not checked here; https is accepted as well as http.
        if (preg_match('#^https?://#i', $uri)) {
            $this->http = true;
            return;
        }
        if (!is_file($uri)) {
            $this->error = self::unavailable($uri);
        }
    }

    /**
     * Load the HTML source, lazily (only if not already in memory).
     *
     * @param mixed $html Unused, kept for backward compatibility.
     * @return string|false|null The source; null when no URI was given, false
     *                           when reading the URI failed. In both failure
     *                           cases $error is set.
     */
    public function html($html=false)
    {
        if ($this->html) {
            return $this->html;
        }
        if (!$this->uri) {
            $this->error = '<p class="error">Erreur interne (pas de fichier demandé).</p>';
            return null;
        }
        $this->html = file_get_contents($this->uri);
        if ($this->html === false) {
            // Record the failure so that body() can report it instead of printing nothing.
            $this->error = self::unavailable($this->uri);
        }
        return $this->html;
    }

    /**
     * Parse the head, lazily, and return the cleaned block of metadata tags.
     *
     * This extraction does not have the limitations of get_meta_tags()
     * ("PHP uses a native function to parse the input, so a Mac file won't
     * work on Unix ... Special characters in the value of the name property
     * are substituted with '_' ... If two meta tags have the same name, only
     * the last one is returned"). Real work with metadata needs to be system
     * independent, full Unicode and repeatable; see for example Dublin Core
     * in HTML.
     *
     * As a side effect $props is filled with a three-level array:
     *
     *   - level 1: key of the property, meta/@name or link/@rel, with the
     *     namespace prefix stripped (dc.title becomes title);
     *   - level 2: index of the value, in declaration order;
     *   - level 3: the value in PDO::FETCH_BOTH style (indexed by both name
     *     and number):
     *       - string value (0 and "string"): meta/@content or link/@title;
     *       - uri value (1 and "uri"): link/@href or meta/@scheme. When a
     *         tag carries no string value, the uri is stored at 0 and "uri".
     *
     * Example:
     *
     *   <head>
     *     <title>Article 13. III. Paix de Longjumeau. Édit de Paris. Édits de pacification.</title>
     *     <meta name="label" content="III, 13"/>
     *     <link rel="dc:isPartOf" href="." title="Édits de pacification"/>
     *     <link rel="DC.isPartOf" href="edit_03" title="III. Paix de Longjumeau. Édit de Paris"/>
     *   </head>
     *
     *   Array (
     *     [title] => Array (
     *       [0] => Array (
     *         [0] => Article 13. III. Paix de Longjumeau. Édit de Paris. Édits de pacification.
     *         [string] => Article 13. III. Paix de Longjumeau. Édit de Paris. Édits de pacification.
     *       )
     *     )
     *     [label] => Array (
     *       [0] => Array (
     *         [0] => III, 13
     *         [string] => III, 13
     *       )
     *     )
     *     [isPartOf] => Array (
     *       [0] => Array (
     *         [0] => Édits de pacification
     *         [string] => Édits de pacification
     *         [1] => .
     *         [uri] => .
     *       )
     *       [1] => Array (
     *         [0] => III. Paix de Longjumeau. Édit de Paris
     *         [string] => III. Paix de Longjumeau. Édit de Paris
     *         [1] => edit_03
     *         [uri] => edit_03
     *       )
     *     )
     *   )
     *
     * @return string The title tag followed by the retained meta/link tags,
     *                one per line.
     */
    public function head()
    {
        if ($this->meta) {
            return $this->meta;
        }
        $head = self::headSub($this->html());
        $this->props = array();

        // Keep the title, both as a property and as a tag for the rebuilt block.
        $titleTag = '';
        if (preg_match('/<title>([^<]+)<\/title>/i', $head, $title)) {
            $titleTag = $title[0];
            $this->props['title'][] = array(0 => $title[1], "string" => $title[1]);
        }

        // Grab every candidate tag, then drop those known not to be metadata.
        preg_match_all("/<(meta|link)[^>]+>/i", $head, $candidates, PREG_PATTERN_ORDER);
        $kept = array();
        foreach ($candidates[0] as $tag) {
            if (self::isNotMeta($tag)) {
                continue;
            }
            $kept[] = $tag;
            // Tags without a name/rel stay in the printed block but give no property.
            if (!preg_match('/(name|rel)="([^"]+)"/i', $tag, $key)) {
                continue;
            }
            // Strip the namespace prefix of the property (dc.title, dc:title).
            $name = $key[2];
            if ($pos = strpos($name, '.')) {
                $name = substr($name, $pos + 1);
            }
            if ($pos = strpos($name, ':')) {
                $name = substr($name, $pos + 1);
            }
            $hasString = preg_match('/(content|title)="([^"]+)"/i', $tag, $string);
            $hasUri = preg_match('/(scheme|href)="([^"]+)"/i', $tag, $uri);
            // Every property is assumed to be repeatable, hence the [] append.
            if ($hasUri && $hasString) {
                $this->props[$name][] = array(0 => $string[2], "string" => $string[2], 1 => $uri[2], "uri" => $uri[2]);
            } elseif ($hasUri) {
                $this->props[$name][] = array(0 => $uri[2], "uri" => $uri[2]);
            } elseif ($hasString) {
                $this->props[$name][] = array(0 => $string[2], "string" => $string[2]);
            }
        }

        // Rebuild a clean block, ready to be included in an HTML head.
        $this->meta = "\n " . $titleTag . "\n " . implode("\n ", $kept);
        return $this->meta;
    }

    /**
     * Return the content of the head element of an HTML string.
     *
     * @param string $html HTML source.
     * @return string Content between the head tags, up to the end of the string
     *                when the closing tag is missing, "" when there is no head.
     */
    public static function headSub($html)
    {
        return self::inner($html, 'head', '');
    }

    /**
     * Return the content of the body element of an HTML string.
     *
     * @param string $html HTML source.
     * @return string Content between the body tags, up to the end of the string
     *                when the closing tag is missing, the whole input when
     *                there is no body element.
     */
    public static function bodySub($html)
    {
        return self::inner($html, 'body', (string) $html);
    }

    /**
     * Parse the head if needed and return the metadata array.
     *
     * @return array See head() for the structure.
     */
    public function props()
    {
        $this->head();
        return $this->props;
    }

    /**
     * Output the metadata tags.
     *
     * @param resource|string|null $out
     *        - omitted or null: print to the standard output, like echo;
     *        - a resource: write to that stream (file or anything else);
     *        - a string (typically ""): print nothing, return the tags.
     * @return string|null The tags when $out is a string, null otherwise.
     */
    public function meta($out=null)
    {
        $this->head();
        if (is_string($out)) {
            return $this->meta;
        }
        if (is_resource($out)) {
            fwrite($out, $this->meta);
        } else {
            // echo writes to the same place as php://output, without opening
            // a stream that would never be closed.
            echo $this->meta;
        }
    }

    /**
     * Regular expressions (pattern => replacement) applied to a body to fix
     * some problems of wild HTML.
     *
     * NOTE(review): the table used to contain an empty pattern ('//i' => ''),
     * which matches everywhere and replaces nothing. It looked like a
     * truncated rule and has been removed; restore the intended pattern if
     * it is known.
     * NOTE(review): confirm whether the &nbsp; replacement is meant to be a
     * plain space or a literal no-break space.
     */
    public static $fixhtml = array(
        '/<\/?font[^>]*>/i' => '',
        '/ class="([^"]+)-western"/i' => ' class="$1"',
        '/&nbsp;/' => ' ',
        '/(<sup><\/?a[^>]*>)<sup>/i' => '$1', // OpenOffice.org specific fix
        '/<\/sup>(<\/a><\/sup>)/i' => '$1', // OpenOffice.org specific fix
        // '/ style="[^"]*"/i' => '', // disabled: causes problems with widths
    );

    /**
     * Return the cleaned body, and optionally write it to a stream.
     *
     * Nothing is printed unless a stream is passed or an error has to be
     * reported.
     *
     * @param resource|string|false $out
     *        - a non-empty string: treated as HTML source; its fixed and cut
     *          body is returned and the object's own document is ignored;
     *        - a resource: the body is written to that stream and returned;
     *        - anything else (false, ""): the body is only returned.
     * @return string|null The body, or null when an error message was printed
     *                     instead.
     */
    public function body($out=false)
    {
        // Called with an HTML string to cut.
        if (is_string($out) && $out) {
            return self::bodySub(self::fix($out));
        }
        // Error already known: print it.
        if ($this->error) {
            $this->printError();
            return;
        }
        $source = $this->html();
        // Loading may just have failed.
        if ($this->error) {
            $this->printError();
            return;
        }
        $html = self::fix(self::bodySub($source));
        // Disabled because unpredictable and not under control:
        // if(class_exists("Diple")) $html=Diple::rewrite($html);
        if (is_resource($out)) {
            fwrite($out, $html);
        }
        return $html;
    }

    /**
     * Try to find a not too bad short title.
     *
     * Only metadata that has already been parsed is consulted; this method
     * does not trigger loading of the document.
     *
     * @return string|null The "label" property, else the "title" property,
     *                     else the label derived from the file name.
     */
    public function label()
    {
        if (isset($this->props['label'])) {
            return $this->props['label'][0][0];
        }
        if (isset($this->props['title'])) {
            return $this->props['title'][0][0];
        }
        return $this->label;
    }

    /** Cut the content of the first $tag element out of $html, or return $default when the element is absent. */
    private static function inner($html, $tag, $default)
    {
        $html = (string) $html;
        // The character class after the name keeps "<head" from matching "<header".
        if (!preg_match('/<' . $tag . '[\s>\/]/i', $html, $open, PREG_OFFSET_CAPTURE)) {
            return $default;
        }
        $start = strpos($html, '>', $open[0][1]);
        if ($start === false) {
            return $default;
        }
        $start++;
        // Look for the closing tag after the opening one only.
        $end = stripos($html, '</' . $tag . '>', $start);
        if ($end === false) {
            return (string) substr($html, $start);
        }
        return (string) substr($html, $start, $end - $start);
    }

    /** Tell whether a meta/link tag is technical (http-equiv, stylesheet, icon) rather than metadata. */
    private static function isNotMeta($tag)
    {
        if (preg_match('/\bhttp-equiv\s*=/i', $tag)) {
            return true;
        }
        // Test the rel value only, so that a description containing the word
        // "icon" or "stylesheet" is not thrown away.
        $rel = '/\brel\s*=\s*(?:"[^"]*(?:stylesheet|icon)|\'[^\']*(?:stylesheet|icon)|[^\s"\'>]*(?:stylesheet|icon))/i';
        return (bool) preg_match($rel, $tag);
    }

    /** Apply the $fixhtml replacements, falling back on the input if PCRE fails. */
    private static function fix($html)
    {
        $html = (string) $html;
        $fixed = preg_replace(array_keys(self::$fixhtml), array_values(self::$fixhtml), $html);
        return ($fixed === null) ? $html : $fixed;
    }

    /** Build the "page unavailable" message; the URI is escaped so it cannot close the HTML comment. */
    private static function unavailable($uri)
    {
        return '<p class="error">Page momentanément indisponible. <!-- ' . htmlspecialchars((string) $uri) . ' --></p>';
    }

    /** Print the current error followed by the (escaped) URI in an HTML comment. */
    private function printError()
    {
        echo $this->error, ' <!--', htmlspecialchars((string) $this->uri), '-->';
    }
}