// Conversions of large documents may be slow: remove the execution time limit.
// 0 is the documented value for "no limit" (-1 is not documented).
set_time_limit(0);

/**
 * Static helpers converting OpenDocument text files (.odt) to XML
 * (TEI and derived formats) through XSLT stylesheets and regular expressions.
 */
class Odt
{
    /** SQLite link, may be used for some lookup */
    public static $pdo;
    /** Error log; error_handler() appends one message per line */
    public static $log;
/**
 * Convert an ODT file to TEI.
 *
 * Pipeline: odtx() extraction, odt_tei.xsl, regex rules read from tei.sed,
 * then tei_post.xsl. The stylesheets and the sed script are read from the
 * directory of this file.
 *
 * @param string      $odt      Path of the source .odt file.
 * @param string|null $destFile Optional destination path. When given, the result is
 *                              written there and the base name without extension is
 *                              passed to odt_tei.xsl as the "filename" parameter.
 * @return string|false The resulting XML, or false when the ODT could not be read.
 */
public static function tei($odt, $destFile = null)
{
    $xml = self::odtx($odt, $destFile);
    // odtx() reports a failure with a falsy value: do not run the
    // transformations on nothing
    if (!$xml) {
        return false;
    }
    $params = array();
    if ($destFile) {
        $params['filename'] = preg_replace('/\.[^\.]*$/', '', basename($destFile));
    }
    $xml = self::xsl($xml, dirname(__FILE__) . '/odt_tei.xsl', $params);
    // regularisation of tags segments, ex: spaces tagged as italic
    $preg = self::sed_preg(file_get_contents(dirname(__FILE__) . '/tei.sed'));
    $xml = preg_replace($preg[0], $preg[1], $xml);
    // xsl step after regularisation of tags segments : index
    $xml = self::xsl($xml, dirname(__FILE__) . '/tei_post.xsl');
    if ($destFile) {
        file_put_contents($destFile, $xml);
    }
    return $xml;
}
/**
 * Convert an ODT file to TEI, then run the result through tei_corr.xsl
 * (read from the directory of this file).
 *
 * @param string $odt Path of the source .odt file.
 * @return mixed Whatever xsl() returns for the second transformation.
 */
public static function corr($odt)
{
    $stylesheet = dirname(__FILE__) . '/tei_corr.xsl';
    return self::xsl(self::tei($odt), $stylesheet);
}
/**
 * Convert an ODT file to TEI, then run the result through tei_philo3.xsl
 * (read from the directory of this file).
 *
 * @param string $odt Path of the source .odt file.
 * @return mixed Whatever xsl() returns for the second transformation.
 */
public static function philo3($odt)
{
    $stylesheet = dirname(__FILE__) . '/tei_philo3.xsl';
    return self::xsl(self::tei($odt), $stylesheet);
}
/**
 * Extract the XML of an ODT archive as one document.
 *
 * meta.xml, styles.xml and content.xml are concatenated, cleaned with the
 * regex rules of odtx.sed (read from the directory of this file) and wrapped
 * in a single <office:document> root element.
 *
 * @param string      $odt      Path of the source .odt file.
 * @param string|null $destFile Kept for interface compatibility; currently unused.
 *                              TODO: extracting the "Pictures/" entries of the archive
 *                              next to the destination file was started but never enabled.
 * @return string|false The XML, or false (with an HTML error message echoed) when the
 *                      zip extension is missing or the archive cannot be opened.
 */
public static function odtx($odt, $destFile = null)
{
    if (!extension_loaded("zip")) {
        echo '<p class="error">Cette fonction nécessite l\'extension PHP zip.</p>';
        return false;
    }
    $zip = new ZipArchive();
    // ZipArchive::open() returns true on success and an error code (a truthy
    // integer) on failure, so the result must be compared strictly with true
    if ($zip->open($odt) !== true) {
        echo '<p class="error">' . $odt . ' non trouvé.</p>';
        return false;
    }
    $xml = '';
    $xml .= $zip->getFromName('meta.xml');
    $xml .= $zip->getFromName('styles.xml');
    $xml .= $zip->getFromName('content.xml');
    $zip->close();
    $preg = self::sed_preg(file_get_contents(dirname(__FILE__) . '/odtx.sed'));
    $xml = preg_replace($preg[0], $preg[1], $xml);
    // wrap in a root element
    $xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n"
        . '<office:document xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0">' . "\n"
        . $xml . "\n</office:document>";
    return $xml;
}
/**
 * Build preg_replace() arguments from a sed-like script.
 *
 * Only substitution lines are used: "s", a delimiter character, the search
 * pattern, the delimiter, the replacement, and optionally the delimiter
 * followed by flags (flags are ignored). Every pattern gets the "u" modifier
 * and sed back-references (\1) are rewritten to the PCRE form ($1).
 * Blank lines, lines not starting with "s" and malformed lines are skipped.
 * The delimiter must not occur inside the pattern or the replacement.
 *
 * @param string $script Content of the sed script.
 * @return array array($search, $replace), two parallel lists for preg_replace().
 */
public static function sed_preg($script)
{
    $search = array();
    $replace = array();
    foreach (explode("\n", (string) $script) as $line) {
        // drop the carriage return of Windows line endings
        $line = rtrim($line, "\r");
        if (strlen($line) < 2 || $line[0] != 's') {
            continue;
        }
        $delim = $line[1];
        // PCRE refuses alphanumeric, backslash and whitespace delimiters:
        // such a line is not a substitution (ex: a word starting with "s")
        if (ctype_alnum($delim) || ctype_space($delim) || $delim === '\\') {
            continue;
        }
        $parts = explode($delim, $line);
        // needs at least: "s", pattern, replacement
        if (count($parts) < 3) {
            continue;
        }
        $search[] = $delim . $parts[1] . $delim . 'u';
        $replace[] = preg_replace('/\\\\([0-9]+)/', '\\$$1', $parts[2]);
    }
    return array($search, $replace);
}
/**
 * Convert an ODT file to HTML: TEI conversion followed by tei_html.xsl.
 *
 * The stylesheet is taken from ../../transform/ relative to this file when
 * it exists there, otherwise it is loaded from its remote URL.
 *
 * @param string $odt Path of the source .odt file.
 * @return mixed Whatever xsl() returns for the HTML transformation.
 */
public static function html($odt)
{
    $tei = self::tei($odt);
    $local = dirname(__FILE__) . "/../../transform/tei_html.xsl";
    $stylesheet = file_exists($local)
        ? $local
        : "http://subversion.cru.fr/diple/trunk/transform/tei_html.xsl";
    return self::xsl($tei, $stylesheet);
}
/**
 * "actesroyaux" output format: currently the plain TEI conversion,
 * without any additional step.
 *
 * @param string $odt Path of the source .odt file.
 * @return mixed The result of tei().
 */
public static function actesroyaux($odt)
{
    return self::tei($odt);
}
/**
 * Convert an ODT dictionary file to the "ngml" markup.
 *
 * The converted XML goes through an ordered list of regular expressions
 * adding dictionary markup (entries, headwords, grammar, usage labels,
 * dates, etymologies, bibliographic references, quotes). When the
 * scriptores.sqlite base exists next to this file, the sigles found are then
 * looked up in it through key(); the lookup log is printed inside an XML
 * comment on the standard output.
 *
 * The order of the rules matters: later rules rely on the tags set by
 * earlier ones.
 *
 * @param string $odt Path of the source .odt file.
 * @return string|null The tagged XML (null if a regular expression fails).
 */
public static function ngml($odt)
{
    // NOTE(review): the original called self::norm(), which this class does not
    // define (fatal error). tei() is the only conversion entry point available
    // here; confirm it yields the markup expected by the rules below.
    $xml = self::tei($odt);
    $preg = array(
        '@ type="N.G.M.L."@' => '', // meaningless attribute
        '@</?(ref|bg_)[^>]*>@' => '', // highlights, copied/pasted links
        '@<num>id.</num>@' => '<name>id.</name>', // id. = 499
        // Take everything possible out of italics so that only definitions remain.
        // Strategy: wrap known terms in a tag, then move these tags out of the
        // italics (and clean up the remaining punctuation).
        '@ <hi>@' => '<hi> ', // ensure a space at the start of italics
        '@(?<=[ ])(cf\.|f\.|m\.|n\.|v\.|v\. lect\.|var\. lect\.|s\.? ?v\.|sc\.)@' => '<abbr>$1</abbr>', // abbreviations not yet tagged
        // usage labels (a stray double quote after "spéc\." prevented that label from ever matching)
        '@(?<=[ ])(\(?)(absol\.|au fig\.|au plur\.|au propre|au sing\.|avec gén\.|avec inf\.|dans l\'expr\.|dans les expr\.|en général|en part\.|fautif pour|par ext\.|par métaph\.|par méton\.|péj\.|sens abstrait|sens actif|sens concret|sens passif|spéc\.)(\)?)@' => '<usg>$1$2$3</usg>',
        '@(?<!>)\([a-zéèêâ]+\.\)@u' => '<domain>$0</domain>', // word in parentheses not yet tagged, u flag for unicode
        '@(?<=[ ])(adj\.|adv\.|comp\.|indécl\.|sive|superl\.)@' => '<foreign>$1</foreign>', // abbreviations listed in the rules
        '@(?<=<hi>)([ ]*)<(abbr|domain|foreign|usg)>([^<]+)</\2>([ ]*)@' => '</hi>$1<$2>$3</$2>$4<hi>', // lookbehind needed for the next rule to work
        '@<hi>([ ]*)<(abbr|usg|foreign)>([^<]+)</\2>([ ]*)@' => '$1<$2>$3</$2>$4<hi>',
        '@([ ]*)<(abbr|usg|foreign)>([^<]+)</\2>([ ]*)</hi>@' => '</hi>$1<$2>$3</$2>$4',
        '@<hi>([, :;]+)@u' => '$1<hi>', // take out the space added above
        '@<hi>[\(\) :]*</hi>@u' => '',
        // disabled rule: '(<hi>forme[^<]*</hi>)' => '',
        // disabled rule: '@<hi>([^<\n]+)</hi>( | ):@' => '<def>$1</def> :', // definition
        // Cross references
        '@\(?DuC\)?@' => '<ref>$0</ref>', // Du Cange
        '@FEW *<num>[^<]+</num>( *[0-9pb\.\-]+)?@' => '<ref>$0</ref>', // the page is not always given
        '@(?<!TLL )<abbr>(v\.|s\.v\.)</abbr> ([^\]\[: <]+)@u' => '<xr rend="$1">$2</xr>', // simple link
        '@(</xr> *<hi>et</hi> )([^<]+)@u' => '$1<xr>$2</xr>', // other links
        '@</xr>( *<hi>[IV]+</hi>)@' => '$1</xr>', // <xr rend="s.v.">piscarius</xr> <hi>IV</hi> => <xr rend="s.v.">piscarius <hi>IV</hi></xr>
        '@([\.\]\( ]+)</xr>@u' => '</xr>$1', // regularisation of punctuation in italics
        // Bold: headwords, numbers
        '@<p>\s*<ident>([1-9]\. )?([a-z][a-zç\(\)]+)</ident>@u' => '<p><orth>$1$2</orth>', // main headword
        '@</body>@' => ' </entry>' . "\n" . '</body>', // close the last entry
        '@<p><orth>@' => ' </entry>' . "\n" . '<entry>' . "\n" . '<p><orth>', // entry
        '@<body>\n </entry>@' => '<body>' . "\n", // first entry
        '@<ident>(.|II|III|IV|VI|VII|VIII)</ident>@u' => "\n" . '<n>$1</n>', // sense number, unicode for alpha, beta
        '@<ident>([^<]+)</ident>@u' => '<orth>$1</orth>', // other headwords
        '@(: )([a-z]*?)( :)@' => '$1<orth>$2</orth>$3', // a single word between two colons is a form
        '@(<hi>forme</hi> )([a-z]*?)( :)@' => '$1<orth>$2</orth>$3',
        // Brackets: etymologies, dates
        '@[\[\(]([ac]\. )?[0-9-—–]+[\]\)]@u' => '<date>$0</date>', // [1964] => <date>[1964]</date>
        '@ [ac]\. [0-9]{3,4}@' => ' <date>$0</date>', // Carta <date>a. 1114</date>
        '@(?<!>)\[.*?\](?!</date>)@' => '<etym>$0</etym>', // etymological note, the assertions avoid over-tagging
        '@<etym>(\[[0-9\.p ]+\])</etym>@u' => '$1', // <etym>[p. 120]</etym> => [p. 120]
        '@(\([^(]*?)<etym>(\[.*?\])</etym>@' => '$1$2', // brackets nested in parentheses are not etymologies, ungreedy!
        // put back into <phr> the letters that fell out of it
        '@</phr>([a-z]{1})@' => '$1</phr>',
        // Bibliographic references
        '@ ib\.@' => ' <name>ib.</name>', // ib. => <name>ib.</name>
        // <name>Papias</name> s.v. nuces. <name>Coll.</name> Salern => <bibl><name>Papias</name> s.v. nuces.</bibl> <bibl><name>Coll.</name> Salern</bibl> :
        '@[\[\(]\?[\]\)]@' => '⟨?⟩', // in some bibliographic references
        '@(<name>.+?)(?=:|<bg|<hi>|<def>|<etym>|<n>|<name>|<note|<usg|<xr|\]</etym>|\][^<]|</p>)@' => '<bibl>$1</bibl>', // reference introduced by a sigle, ungreedy wildcard and lookahead, notably for lists of sigles
        '@(\([^\)\<\>]+)</bibl>( *\))@' => '$1$2</bibl>', // ... (XI s.</bibl>) => ... (XI s.)</bibl>
        '@([ ]+)</bibl>@u' => '</bibl>$1', // move trailing spaces after </bibl>
        '@</bibl>\.@' => '.</bibl>', // </bibl>. => .</bibl>
        '@</bibl>\]@' => ']</bibl>', // </bibl>] => ]</bibl>
        '@<name>(Carta|Chron.)</name> <date>[^<]+</date>( <ref>\(DuC\)</ref>)?@' => '<sigle>$0</sigle>', // special sigles
        '@</bibl> : *(...*?)(?=:[ ][^a-z«‑]|.<abbr>cf.|<bibl>|<etym>|\]</etym>| <hi|<n>|<note|<phr>|</p>|<usg|<xr|\r|\n)@' => '</bibl> : <quote>$1</quote>', // supposes a non-breaking space before ":", ungreedy, includes tags, <etym> must be tagged before
        // Removal of <quote><bibl> sequences
        '@<quote><bibl>(.+?)</quote>@' => '<bibl>$1',
        '@<quote><xr(.+?)</quote>@' => '<xr$1', // <quote><xr rend="v.">1. penna</xr> <hi>IA1. </quote>
        // Inflection patterns, <etym> is used as a boundary
        '@</orth>([^>]+<abbr>[fmn]\.</abbr>)@' => '</orth><gram>$1</gram>', // morphology of nouns
        '@</orth>([ ,][^\<\>]+)@' => '</orth><gram>$1</gram>', // <orth>pistus</orth>, ‑a, ‑um => <orth>pistus</orth><gram>, ‑a, ‑um</gram>, adjectives always follow the headword
        '@\[</gram>@' => '</gram>[', // take grammatical notes out of etymological notes
        // Cleaning
        '@<gram>([ ,]+)@' => '$1<gram>',
        '@ +</gram>@' => '</gram> ',
    );
    $xml = preg_replace(array_keys($preg), array_values($preg), $xml);

    // match and validate the sigles against the base, when available
    $base = dirname(__FILE__) . '/scriptores.sqlite';
    if (file_exists($base)) {
        // sigles kept as they are, without base lookup (see key())
        self::$keyStop = array_fill_keys(array(
            "Arist. eth.", "Aug. c. acad.", "Aug. civ.", "Aug. cons. evang.", "Aug. enchir.",
            "Aug. in evang. Ioh.", "Aug. in psalm.", "Aug. retract.", "Aug. serm.", "Aug. tract.",
            "Aug. vera relig.", "Beda hex.", "Beda homil. evang.", "Beda metr.", "Beda tabern.",
            "Beda temp. rat.", "Cassiod. in psalm.", "Char. gramm.", "Cic. inv.", "Cic. Lael.",
            "Diosc.", "Fest.", "Fulg. myth.", "Hier. c. Ioh.", "Hier. epist.",
            "Hier. in Dan.", "Hier. in Is.", "Hier. in Matth.", "Hier. in psalm.", "Hier. nom. hebr.",
            "Hier. onom. num.", "Hier. pref. Vulg. Ezech.", "Hier. vita Pauli", "Hor. ars", "Hor. carm. I",
            "Hor. sat.", "Hyg. fab.", "Hygin. astron.", "Isid. diff.", "Isid. etym.",
            "Isid. reg. monach.", "Mart.", "Mart. Cap.", "Mart. Cap. I", "Mart. epigr.",
            "Max. Conf.", "Max. Taur.", "Plaut. Aul.", "Plaut. Capt.", "Plin. epist.",
            "Prisc.", "Prisc. gramm.", "Prisc. gramm. II", "Rufin. hist.", "Rufin. Orig. in gen.",
            "Rufin. Orig. in Rom. pref. Rufin.", "Rufin. patr.", "Sen. epist.", "Serv. ecl.", "Sidon. epist.",
            "Suet. Nero", "Tert. orat.", "Varro", "Vell.", "Ven. Fort. carm.",
            "Ven. Fort. vita Radeg.", "Verg. Aen.", "Verg. georg.", "Vet. Lat. Eph.", "Vet. Lat. exod.",
            "Vet. Lat. Sirach",
        ), "");
        // connection to the base
        self::$pdo = new PDO('sqlite:' . $base);
        // prepare the lookup query
        self::$keyLike = self::$pdo->prepare('SELECT sigle FROM siglae WHERE norm LIKE ?');
        // name of the element wrapping a sigle
        self::$keyEl = "sigle";
        // open a stream to log the keys not found
        self::$keyOut = fopen("php://output", "w");
        // run a regular expression on the supposed pattern of a sigle,
        // key() does the lookup for each match; the log is hidden in a comment
        print "<!-- ";
        $xml = preg_replace_callback(
            '@(?<!<sigle>)<name>.*?(?=</bibl>| <| s.v.| p\.| col\.| t\.| f\.| P [0-9<]| [0-9]| p[0-9]| \(éd.)@',
            array(__CLASS__, 'key'), // PHP convention to call a static method
            $xml
        );
        print "-->";
        if (is_resource(self::$keyOut)) {
            fclose(self::$keyOut);
        }
    }
    // disabled step: $xml=self::xsl($xml, dirname(__FILE__).'/tei_ngml.xsl');
    return $xml;
}
/** Name of the element key() wraps around a matched sigle; ngml() sets it to "sigle" */
public static $keyEl = "name";
/** Declared but not used by the code of this class */
public static $keyExact;
/** Prepared statement used by key() for the lookups; prepared by ngml() */
public static $keyLike;
/** Stop list: an entry whose cleaned text is a key of this array is not looked up by key() */
public static $keyStop = array();
/** Stream where key() logs ambiguous or unknown sigles, when it is an open resource */
public static $keyOut;
/** Number of the next line written to the log of key() */
public static $keyCount = 1;
/**
 * preg_replace_callback() handler: wrap a supposed sigle in an element
 * (named by self::$keyEl) carrying a key attribute.
 *
 * The matched text is cleaned into a key (tags and some punctuation removed)
 * and looked up with self::$keyLike, first as is, then as a prefix. While
 * nothing is found, the last word is cut from the text and the lookup is
 * tried again (at most 10 times); the words cut are appended after the element.
 *
 * Resulting key attribute:
 *  - exactly one record: the sigle found in the base;
 *  - several records: the tested key followed by "*" (logged);
 *  - key in self::$keyStop: the cleaned key, without lookup;
 *  - nothing left to cut: "?" around the whole match (logged);
 *  - empty key: no attribute.
 *
 * @param array $matches Matches of the regular expression, only $matches[0] is used.
 * @return string The replacement markup.
 */
public static function key($matches)
{
    // cleaning rules turning the matched markup into a lookup key
    $preg = array(
        '@<[^>]+>@' => '',
        '@[\*\(\)\?⟨⟩]@u' => '',
        '@^ib\..*@' => '',
        '@[ :\(]+$@u' => '',
        '@\s+@' => ' ',
    );
    $regex = array_keys($preg);
    $replace = array_values($preg);
    $value = $matches[0];
    $after = ""; // what to append on the value matched
    $key = "";
    $i = 10; // loop limit
    // loop on the string to test
    while ($value && $i) {
        $key = preg_replace($regex, $replace, $value);
        if (!$key) {
            break;
        }
        // key in the stop list: no base lookup, the cleaned key is kept as is
        if (isset(self::$keyStop[$key])) {
            break;
        }
        // try exact
        self::$keyLike->execute(array($key));
        $values = self::$keyLike->fetchAll(PDO::FETCH_COLUMN);
        if (!count($values)) {
            // try prefix
            self::$keyLike->execute(array(rtrim($key, '.') . '%'));
            $values = self::$keyLike->fetchAll(PDO::FETCH_COLUMN);
        }
        $count = count($values);
        // exit, it's OK
        if ($count == 1) {
            $key = $values[0];
            break;
        }
        // more than one value, log it
        if ($count > 1) {
            $key = $key . "*";
            if (is_resource(self::$keyOut)) {
                fwrite(self::$keyOut, "\n" . self::$keyCount++ . ' "' . preg_replace($regex, $replace, $matches[0]) . '" : ');
                if ($count < 10) {
                    // separator first: the reversed argument order is refused by PHP 8
                    fwrite(self::$keyOut, implode(', ', $values));
                } else {
                    fwrite(self::$keyOut, $key . " (" . $count . ')');
                }
            }
            break;
        }
        // nothing found, cut $value on the last space
        $i--;
        $pos = strrpos($value, ' ');
        if ($pos) {
            $after = " " . substr($value, $pos + 1) . $after;
            $value = substr($value, 0, $pos);
        }
        // validation pb: nothing left to cut, or the cut fell inside <name>…</name>
        if (!$pos || !$value || (strrpos($value, '<name>') !== false && !strrpos($value, '</name>'))) {
            $value = $matches[0];
            $after = "";
            if (is_resource(self::$keyOut)) {
                fwrite(self::$keyOut, "\n" . self::$keyCount++ . ' "' . preg_replace($regex, $replace, $matches[0]) . '" : ?');
            }
            $key = "?";
            break;
        }
    }
    $att = "";
    if ($key) {
        $att = ' key="' . $key . '"';
    }
    return '<' . self::$keyEl . $att . '>' . $value . '</' . self::$keyEl . '>' . $after;
}
/**
 * Apply an XSLT stylesheet to an XML string.
 *
 * @param string     $xml      XML source.
 * @param string     $xsl_file Path or URL of the stylesheet.
 * @param array|null $params   Optional stylesheet parameters, name => value.
 * @return string|false The result of the transformation, or false when the source
 *                      is empty, when the source or the stylesheet cannot be
 *                      loaded, or when the transformation fails.
 */
public static function xsl($xml, $xsl_file, $params = null)
{
    // an empty source cannot be parsed (and DOMDocument::loadXML() throws on it with PHP 8)
    if (!is_string($xml) || $xml === '') {
        return false;
    }
    $dom = new DOMDocument("1.0", "UTF-8");
    if (!$dom->loadXML($xml)) {
        return false;
    }
    // could be optimised in case of several calls with the same stylesheet
    $xsl = new DOMDocument();
    if (!$xsl->load($xsl_file)) {
        return false;
    }
    $proc = new XSLTProcessor();
    if (!$proc->importStylesheet($xsl)) {
        return false;
    }
    // transpose params
    if (is_array($params)) {
        foreach ($params as $key => $value) {
            $proc->setParameter('', $key, $value);
        }
    }
    return $proc->transformToXML($dom);
}
/**
 * Error handler collecting the messages in self::$log, one per line,
 * instead of letting them be displayed.
 *
 * The trailing parameters are optional: PHP 8 no longer passes $errcontext
 * to error handlers, and a mandatory fifth parameter makes the call fail.
 *
 * @param int    $errno      Level of the error (unused).
 * @param string $errstr     Error message.
 * @param string $errfile    File where the error was raised (unused).
 * @param int    $errline    Line where the error was raised (unused).
 * @param mixed  $errcontext Legacy symbol table argument (unused).
 */
public static function error_handler($errno, $errstr, $errfile = '', $errline = 0, $errcontext = null)
{
    self::$log .= $errstr . "\n";
}
}
// included file, do nothing
if (basename($_SERVER['SCRIPT_FILENAME']) != basename(__FILE__)) {
    // file included by another script: only the class is declared
} else {
    // The format is used as the name of the static method to call: accept
    // only the conversion methods, never an arbitrary user-supplied name.
    $formats = array('tei', 'odtx', 'html', 'ngml', 'corr', 'philo3', 'actesroyaux');

    // direct command line call, work
    if (php_sapi_name() == "cli") {
        array_shift($_SERVER['argv']); // shift first arg, the script filepath
        if (!count($_SERVER['argv'])) {
            exit(
                "\n"
                . 'usage : php -f Odt.php src.odt format? dest/?' . "\n"
                . 'src.odt : glob patterns are allowed, but in quotes, to not be expanded by shell "folder/*.odt"' . "\n"
                . 'format? : optional dest format, default tei, others may be odtx, html, ngml' . "\n"
            );
        }
        $glob = array_shift($_SERVER['argv']);
        $format = array_shift($_SERVER['argv']);
        if (!$format) {
            $format = "tei";
        }
        if (!in_array(strtolower($format), $formats, true)) {
            exit("Unknown format: $format\n");
        }
        $ext = ".$format";
        if ($ext == '.tei') {
            $ext = ".xml";
        }
        // glob() returns false on error
        $files = glob($glob);
        if (!$files) {
            $files = array();
        }
        foreach ($files as $odt) {
            $dest = dirname($odt) . '/' . basename($odt, ".odt") . $ext;
            print "$odt > $dest\n";
            file_put_contents($dest, call_user_func(array("Odt", $format), $odt));
        }
    }
    // direct http call, work
    else {
        if (!count($_FILES)) {
            exit;
        }
        // file upload
        reset($_FILES);
        $tmp = current($_FILES);
        // no file sent (or several files under the same field name)
        if (!is_string($tmp['tmp_name']) || !$file = $tmp['tmp_name']) {
            exit;
        }
        $format = isset($_REQUEST['format']) ? $_REQUEST['format'] : "tei";
        if (!is_string($format) || !in_array(strtolower($format), $formats, true)) {
            header("HTTP/1.1 400 Bad Request");
            exit('Unknown format');
        }
        // output xml
        if (isset($_REQUEST['download'])) {
            header("Content-Type: text/xml");
            // name of the uploaded file, without its extension
            $name = $tmp['name'];
            $dot = strrpos($name, '.');
            if ($dot !== false) {
                $name = substr($name, 0, $dot);
            }
            // the name is supplied by the client and goes inside a quoted header value
            $name = str_replace(array('"', '\\', "\r", "\n"), '', $name);
            if ($name === '') {
                $name = "odt_tei";
            }
            header('Content-Disposition: attachment; filename="' . $name . '.xml"');
        } elseif ($format == 'html') {
            header("Content-Type: text/html; charset=UTF-8");
        }
        // chrome do not like text/xml
        else {
            header("Content-Type: text/plain; charset=UTF-8");
        }
        $xml = call_user_func(array("Odt", $format), $file);
        // reindent, easier to read
        $dom = new DOMDocument("1.0", "UTF-8");
        $dom->formatOutput = true;
        $dom->preserveWhiteSpace = false; // risky ?
        $dom->recover = true;
        // redirect the errors to the log
        set_error_handler(array('Odt', "error_handler"), E_ALL);
        // LIBXML_NOENT is deliberately not set: the XML comes from an uploaded
        // file and entity substitution would allow external entity injection
        if (is_string($xml) && $xml !== '') {
            $dom->loadXML($xml, LIBXML_NONET | LIBXML_NSCLEAN | LIBXML_NOCDATA | LIBXML_COMPACT | LIBXML_PARSEHUGE);
        }
        restore_error_handler();
        // indented output
        echo $dom->saveXML();
        if (Odt::$log) {
            echo "<!--\n", Odt::$log, '-->';
        }
    }
}
?>