HTML-Tabellen: Tabellen lassen sich in PHP gut weiterverarbeiten, wenn man sie in ein PHP-Array parsen kann. Irgendwie habe ich keine gute Klasse gefunden, die noch einigermaßen moderne Techniken nutzt, sodass ich kurz eine Klasse für meine Bedürfnisse zusammengeschrieben habe. Sie basiert auf DOMDocument und nicht wie viele andere auf irgendwelchen regulären Ausdrücken, die bei nicht konformen Tabellen schnell den Dienst einstellen.
Features
- Header / Titelfelder werden anhand von Bild- und Link-Attributen gesetzt, wenn kein Textelement vorhanden ist.
- Tabelleninhalt kann als reiner Text oder als HTML ausgegeben werden
- Titelfelder werden als Keys im PHP Array genutzt
VCS File:
/trunk/DomTableExtractor/example.php
SVN Browser: phptools -> /trunk/DomTableExtractor/example.php
<?php
// $Id: example.php 13 2011-11-05 12:54:10Z espendiller $
// $HeadURL: file:///var/svn-repos/phptools/trunk/DomTableExtractor/example.php $
// Sample markup: a four-column table whose third header cell contains only an
// image, so its column name has to come from the image's title attribute.
$table = '
<table class="myclass">
<thead>
<tr>
<th>Header1</th>
<th>Header2</th>
<th><img src="#" title="Header3"></th>
<th>Header4</th>
</tr>
</thead>
<tbody>
<tr>
<td>Lorum ipsum dolor sit amet</td>
<td>Lorum ipsum dolor sit amet</td>
<td>Lorum ipsum dolor sit amet</td>
<td>Lorum ipsum dolor sit amet</td>
</tr>
<tr>
<td>Lorum ipsum dolor sit amet</td>
<td>Lorum ipsum dolor sit amet</td>
<td>Lorum ipsum dolor sit amet</td>
<td>Lorum ipsum dolor sit amet</td>
</tr>
</tbody>
</table>
';
include("DomTableExtractor.php");
$test = new DomTableExtractor();
// Fall back to alt/title attributes for header cells without text.
// (One call is enough: the method only switches a flag on.)
$test->findHeaderOnTags();
// Use the first row as the keys of every following row.
$test->CombineHeaders();
// The second argument is an XPath predicate selecting the table.
$ar = $test->GetData($table, '@class="myclass"');
print_r($ar);
/*
Array
(
[0] => Array
(
[Header1] => Lorum ipsum dolor sit amet
[Header2] => Lorum ipsum dolor sit amet
[Header3] => Lorum ipsum dolor sit amet
[Header4] => Lorum ipsum dolor sit amet
)
[1] => Array
(
[Header1] => Lorum ipsum dolor sit amet
[Header2] => Lorum ipsum dolor sit amet
[Header3] => Lorum ipsum dolor sit amet
[Header4] => Lorum ipsum dolor sit amet
)
)
*/
?>/trunk/DomTableExtractor/DomTableExtractor.php
SVN Browser: phptools -> /trunk/DomTableExtractor/DomTableExtractor.php
<?php
// $Id: DomTableExtractor.php 14 2011-11-05 13:05:56Z espendiller $
// $HeadURL: file:///var/svn-repos/phptools/trunk/DomTableExtractor/DomTableExtractor.php $
/**
* This class converts a HTML table with headers or not to a php array which can also filter
* It uses DOMDocument and XPath
*
*/
/**
 * Converts an HTML table (with or without a header row) into a PHP array.
 *
 * The markup is parsed with DOMDocument and queried with DOMXPath.
 * Options are set through the constructor or the setter-style methods
 * (HeadersToLower, findHeaderOnTags, CombineHeaders, DeleteFirstRows).
 */
class DomTableExtractor {
    /** @var bool Lower-case header names before they are used as array keys. */
    private $headersToLower = false;
    /** @var bool Use the first remaining row as keys for all following rows. */
    private $combineHeaders = false;
    /** @var bool True: cells are returned as inner HTML; false: as plain text. */
    private $filterOutput = false;
    /** @var bool Derive names of empty header cells from an alt/title attribute. */
    private $findHeaderOnTags = false;
    /** @var int Number of leading rows dropped before any header handling. */
    private $DeleteFirstRows = 0;

    /**
     * @param bool $headersToLower lower-case the header names
     * @param bool $combineHeaders key every data row by the header row
     * @param bool $filterOutput   return cell contents as HTML instead of text
     */
    public function __construct($headersToLower = false, $combineHeaders = false, $filterOutput = false) {
        $this->headersToLower = $headersToLower;
        $this->combineHeaders = $combineHeaders;
        $this->filterOutput = $filterOutput;
    }

    /**
     * Extracts the rows of every table matching the selector.
     *
     * @param string $html     HTML fragment containing the table
     * @param string $selector XPath predicate placed inside "//table[...]",
     *                         e.g. '@class="myclass"'
     * @return array list of rows; each row is a list of cell values, or a
     *               header => value map when header combining is enabled
     */
    public function GetData($html, $selector) {
        $dom = $this->filter_dom_load($html);
        $xpath = new DOMXPath($dom);

        $rows = $xpath->query('//table[' . $selector . ']//tr');
        if ($rows === false) {
            // Malformed selector: there is nothing that could be extracted.
            return array();
        }

        $skip = $this->DeleteFirstRows > 0 ? (int) $this->DeleteFirstRows : 0;
        $tableArray = array();
        $rawHeaders = null;
        $rowIndex = 0;

        foreach ($rows as $row) {
            // The header row is the first row that survives DeleteFirstRows.
            // Its markup is always kept so an alt/title fallback stays
            // possible even when cells are returned as plain text.
            $isHeaderRow = $this->combineHeaders && $rowIndex === $skip;
            $values = array();
            $rawCells = array();

            // Query relative to the row: only its own cells, not the cells
            // of tables nested inside one of them.
            foreach ($xpath->query('td|th', $row) as $cell) {
                $cellHtml = ($this->filterOutput || $isHeaderRow) ? $this->innerHTML($cell) : null;
                $values[] = $this->filterOutput ? $cellHtml : $cell->nodeValue;
                $rawCells[] = $cellHtml;
            }

            if ($isHeaderRow) {
                $rawHeaders = $rawCells;
            }
            $tableArray[] = $values;
            $rowIndex++;
        }

        if ($skip > 0) {
            $tableArray = array_slice($tableArray, $skip);
        }
        if ($this->combineHeaders) {
            return $this->_CombineHeaders($tableArray, $rawHeaders);
        }
        return $tableArray;
    }

    /**
     * Reduces a piece of header markup to trimmed, entity-decoded text.
     *
     * @param string $headerNames markup of a single header cell
     * @return string
     */
    private function _filterOutput($headerNames) {
        return trim(html_entity_decode(strip_tags($headerNames)));
    }

    /**
     * Supplies a name for a header cell that has no text of its own.
     *
     * @param string $header          the header name derived so far
     * @param string $unFilterdHeader raw markup of the same header cell
     * @return string $header if it is not empty, otherwise the value of the
     *                first alt/title attribute in the markup, or 'empty'
     */
    private function _findHeaderOnTags($header, $unFilterdHeader) {
        if (strlen($header) == 0) {
            return preg_match('/(?:alt|title)="([^"]*)"/i', $unFilterdHeader, $match) ? $match[1] : 'empty';
        }
        return $header;
    }

    /**
     * Removes the first row and uses it as keys for the remaining rows.
     *
     * @param array      $tableArray list of rows, header row first
     * @param array|null $rawHeaders raw markup of the header cells; defaults
     *                               to the header row itself
     * @return array list of header => value maps; rows whose cell count
     *               differs from the header count are skipped
     */
    private function _CombineHeaders($tableArray, $rawHeaders = null) {
        $data = array();

        // first row is header so remove it and get it
        $headerRow = array_shift($tableArray);
        if ($headerRow === null) {
            // No rows at all, hence no header and no data.
            return $data;
        }
        if ($rawHeaders === null) {
            $rawHeaders = $headerRow;
        }
        if ($this->headersToLower) {
            $rawHeaders = array_map('strtolower', $rawHeaders);
        }

        // headers can contain html; filter it for headers / array keys
        $headerNames = $this->filterOutput
            ? $rawHeaders
            : array_map(array($this, '_filterOutput'), $rawHeaders);

        if ($this->findHeaderOnTags) {
            $headerNames = array_map(array($this, '_findHeaderOnTags'), $headerNames, $rawHeaders);
        }

        // map key (headers) and value
        $expected = count($headerNames);
        foreach ($tableArray as $rowData) {
            // Rows with a different cell count (colspan, rowspan or invalid
            // markup) cannot be keyed by the headers and are left out.
            if (count($rowData) == $expected) {
                $data[] = array_combine($headerNames, $rowData);
            }
        }
        return $data;
    }

    /**
     * Parses an HTML fragment into a DOMDocument.
     *
     * The fragment is wrapped in a complete document declaring UTF-8.
     * Based on http://api.drupal.org/api/drupal/modules--filter--filter.module/function/filter_dom_load/7
     *
     * @param string $text HTML fragment
     * @return DOMDocument
     */
    private function filter_dom_load($text) {
        $dom_document = new DOMDocument();
        // Collect parser complaints about HTML soup internally instead of
        // emitting warnings, then restore the caller's libxml setting.
        $previous = libxml_use_internal_errors(true);
        $dom_document->loadHTML('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>' . $text . '</body></html>');
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        return $dom_document;
    }

    /**
     * Returns the markup inside an element, without the element's own tags.
     *
     * Based on http://www.php.net/manual/en/book.dom.php#105815
     *
     * @param DOMNode $el element whose content is wanted
     * @return string
     */
    private function innerHTML($el) {
        $doc = new DOMDocument();
        $doc->appendChild($doc->importNode($el, TRUE));
        $html = trim($doc->saveHTML());
        $tag = $el->nodeName;
        // Strip the leading opening tag and the trailing closing tag.
        return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
    }

    /** Enables lower-casing of the header names. */
    public function HeadersToLower() {
        $this->headersToLower = true;
    }

    /** Enables the alt/title fallback for header cells without text. */
    public function findHeaderOnTags() {
        $this->findHeaderOnTags = true;
    }

    /** Enables keying every data row by the header row. */
    public function CombineHeaders() {
        $this->combineHeaders = true;
    }

    /**
     * Sets how many leading rows are dropped before header handling.
     *
     * @param int $numrows number of rows to drop
     */
    public function DeleteFirstRows($numrows) {
        $this->DeleteFirstRows = $numrows;
    }
}
?>