<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2018 Stefan Beyer SEDAT GmbH <stefan@sedat.de>
*  All rights reserved
*
*  This script is part of Cy4Marktzeitung. The Cy4Marktzeitung project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 3 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
// Abort immediately when this file is loaded without the ENTRY_POINT
// constant having been defined beforehand.
if (!defined('ENTRY_POINT')) {
  exit('Falscher Aufruf');
}


/**
 * Removes every tag and attribute that has not been explicitly allowed.
 *
 * When a block-level element is removed, a single space is appended to its
 * content so that neighbouring text does not get glued together unexpectedly.
 *
 * The allow-list is static and therefore shared by all instances:
 *
 * HTMLCleaner::$allowedTags = [
 *   'p'   => false,             // allow <p> without any attributes
 *   'img' => true,              // allow <img> with all of its attributes
 *   'a'   => ['href', 'title'], // allow <a> with href and title only
 * ];
 *
 * NOTE(review): attributes are filtered by name only; attribute values
 * (e.g. "javascript:" URLs in href) are not inspected by this class.
 */
class HTMLCleaner {
  const REMOVE_MULTIPLE_BREAKS = 1;
  const REMOVE_BREAKS_BEFORE_BLOCKS = 2;
  const REMOVE_BREAKS_AFTER_BLOCKS = 4;
  const REMOVE_BREAKS_BEFORE_BLOCKENDS = 8;
  const REMOVE_BREAKS_AFTER_BLOCKSTARTS = 16;

  const FLAGS_ALL = self::REMOVE_MULTIPLE_BREAKS | self::REMOVE_BREAKS_BEFORE_BLOCKS | self::REMOVE_BREAKS_AFTER_BLOCKS | self::REMOVE_BREAKS_BEFORE_BLOCKENDS | self::REMOVE_BREAKS_AFTER_BLOCKSTARTS;

  /** @var int Bit mask of the REMOVE_* constants. */
  private $flags;

  /**
   * Element names treated as block level: removing one of them leaves a
   * space behind, and <br/> elements next to them can be stripped.
   *
   * @var string[]
   */
  public static $blockElements = [
    'p','div','ul','ol','li','hr','br',
    'h1','h2','h3','h4','h5','h6',
    'table','blockquote','pre','dl','dd','tfoot','thead',
    'header','footer','address','article','aside','section',
    'audio','video','canvas','fieldset','figure','figcaption','form',
    'hgroup','noscript','output',
  ];

  /** @var string|null Lazily built regex alternation of $blockElements. */
  public static $blockElementsRegEx = null;

  /** @var array Tag name => true (all attributes) | false (none) | string[] (listed attributes). */
  public static $allowedTags = [];

  /**
   * HTML void elements; these may stay self-closing ("<br/>") in the output.
   *
   * @var string[]
   */
  private static $voidElements = [
    'area','base','br','col','embed','hr','img','input',
    'link','meta','param','source','track','wbr',
  ];

  /**
   * @param int $flags Bit mask of REMOVE_* constants selecting the <br/> clean-up steps.
   */
  public function __construct($flags = self::FLAGS_ALL) {
    $this->flags = $flags;
  }

  /**
   * Cleans an HTML fragment according to the static allow-list.
   *
   * Input that contains no "<" is returned unchanged. Otherwise the fragment
   * is parsed, disallowed elements are unwrapped (their content is kept),
   * disallowed attributes are dropped and the result is serialised in
   * XML style ("<br/>"). Afterwards the <br/> clean-ups selected by the
   * constructor flags are applied.
   *
   * @param string $text HTML fragment (UTF-8).
   * @return string Cleaned fragment.
   */
  public function clean($text) {
    // null is handed back as-is; this avoids passing null to strpos().
    if ($text === null || strpos($text, '<') === false) {
      return $text;
    }

    $dom = new DOMDocument('1.0', 'UTF-8');
    // The <?xml ...> prefix makes libxml read the input as UTF-8; the <div>
    // wrapper guarantees a single root element for the fragment.
    $loaded = $dom->loadHTML(
      '<?xml encoding="utf-8" ?><div>'.$text.'</div>',
        LIBXML_HTML_NOIMPLIED
      | LIBXML_HTML_NODEFDTD
      | LIBXML_NOBLANKS
      | LIBXML_NOCDATA
      | LIBXML_NOERROR
    );
    $root = $dom->documentElement;
    if (!$loaded || $root === null) {
      // Parsing failed: fall back to removing every tag rather than
      // returning unfiltered markup.
      return strip_tags($text);
    }

    $this->walk($root);

    // Serialise only the children of the wrapper. This avoids stripping the
    // wrapper with regular expressions, which also hit allowed <div> tags at
    // line boundaries and left "<div/>" behind for empty results.
    $result = '';
    foreach ($root->childNodes as $child) {
      $result .= $dom->saveXML($child);
    }

    $blocks = self::blockElementsPattern();

    if ($this->flags & self::REMOVE_MULTIPLE_BREAKS) {
      // Collapse a run of any length into a single <br/>.
      $result = preg_replace('`<br/>(?:\s*<br/>)+`u', '<br/>', $result);
    }

    if ($this->flags & self::REMOVE_BREAKS_BEFORE_BLOCKS) {
      // <br/> directly before a block start tag, e.g. "<br/><ul>"
      $result = preg_replace('`<br/>\s*(<(?:'.$blocks.')(?:\s|/?>))`u', '\\1', $result);
    }
    if ($this->flags & self::REMOVE_BREAKS_AFTER_BLOCKS) {
      // <br/> directly after a block end tag, e.g. "</ul><br/>"
      $result = preg_replace('`(</(?:'.$blocks.')>)\s*<br/>`u', '\\1', $result);
    }

    if ($this->flags & self::REMOVE_BREAKS_BEFORE_BLOCKENDS) {
      // <br/> directly before a block end tag, e.g. "<br/></p>"
      $result = preg_replace('`<br/>\s*(</(?:'.$blocks.')>)`u', '\\1', $result);
    }
    if ($this->flags & self::REMOVE_BREAKS_AFTER_BLOCKSTARTS) {
      // <br/> directly after a block start tag, e.g. "<p><br/>". The
      // look-ahead keeps "p" from matching tags like <picture>.
      $result = preg_replace('`(<(?:'.$blocks.')(?=[\s/>])[^>]*>)\s*<br/>`u', '\\1', $result);
    }

    // Drop empty paragraphs.
    // TODO: consider doing this for all block-level elements (with care).
    $result = str_replace('<p></p>', '', $result);

    // Leading / trailing <br/> of the whole fragment only (\A and \z instead
    // of per-line anchors, so breaks in front of a newline survive).
    $result = preg_replace('`\A\s*<br/>`u', '', $result);
    $result = preg_replace('`<br/>\s*\z`u', '', $result);

    return $result;
  }

  /**
   * Returns the regex alternation for all block element names, building and
   * caching it in self::$blockElementsRegEx on first use.
   *
   * @return string
   */
  private static function blockElementsPattern() {
    if (self::$blockElementsRegEx === null) {
      $quoted = array_map(
        function ($name) { return preg_quote($name, '`'); },
        array_unique(self::$blockElements)
      );
      self::$blockElementsRegEx = implode('|', $quoted);
    }
    return self::$blockElementsRegEx;
  }

  /**
   * Recursively filters the element children of $node in place.
   *
   * Allowed elements keep their place (attributes filtered, children
   * walked). Any other element is replaced by its own, recursively
   * filtered, content.
   *
   * @param DOMNode $node Element or document fragment whose children are processed.
   * @return void
   */
  private function walk($node) {
    $doc = $node->ownerDocument;
    $child = $node->firstChild;
    while ($child) {
      // Remember the successor first; $child may be detached below.
      $next = $child->nextSibling;

      if ($child->nodeType !== XML_ELEMENT_NODE) {
        $child = $next;
        continue;
      }

      $name = $child->nodeName;

      if (array_key_exists($name, self::$allowedTags)) {
        $this->filterAttributes($child, self::$allowedTags[$name]);
        $this->walk($child);
        if (!$child->hasChildNodes() && !in_array($name, self::$voidElements, true)) {
          // saveXML() would write an empty element as "<a/>", which HTML
          // parsers read as an unclosed start tag. An empty text child
          // forces the "<a></a>" form.
          $child->appendChild($doc->createTextNode(''));
        }
        $child = $next;
        continue;
      }

      // Disallowed element: gather copies of its children in a fragment.
      $fragment = $doc->createDocumentFragment();
      foreach ($child->childNodes as $inner) {
        if ($inner->nodeType === XML_CDATA_SECTION_NODE) {
          // Bodies of <script>/<style> may arrive as CDATA sections, which
          // saveXML() would emit unescaped inside "<![CDATA[...]]>".
          // Re-create them as plain text so the content gets escaped.
          $fragment->appendChild($doc->createTextNode($inner->nodeValue));
        } else {
          $fragment->appendChild($inner->cloneNode(true));
        }
      }
      if (in_array($name, self::$blockElements, true)) {
        // Keep a gap where the block used to be so texts do not run together.
        $fragment->appendChild($doc->createTextNode(' '));
      }
      $this->walk($fragment);

      if ($fragment->hasChildNodes()) {
        $node->replaceChild($fragment, $child);
      } else {
        // Nothing to put in its place; do not insert an empty fragment.
        $node->removeChild($child);
      }
      $child = $next;
    }
  }

  /**
   * Removes every attribute of $element that the allow-list entry does not permit.
   *
   * @param DOMElement $element
   * @param mixed $allowed true keeps all attributes, an array keeps the
   *                       listed names, anything else removes all attributes.
   * @return void
   */
  private function filterAttributes($element, $allowed) {
    if ($allowed === true) {
      return;
    }
    $keep = is_array($allowed) ? $allowed : [];

    // Collect first: the attribute map must not change while iterating it.
    $remove = [];
    foreach ($element->attributes as $attribute) {
      if (!in_array($attribute->name, $keep, true)) {
        $remove[] = $attribute->name;
      }
    }
    foreach ($remove as $attributeName) {
      $element->removeAttribute($attributeName);
    }
  }

  /**
   * Empties the shared allow-list.
   *
   * @return void
   */
  public static function resetAllowedTags() {
    self::$allowedTags = [];
  }

  /**
   * Adds, replaces or removes one allow-list entry.
   *
   * Names are lower-cased because the parsed node and attribute names that
   * walk() compares against are lower case.
   *
   * @param string $tagName Tag name.
   * @param bool|string[]|null $attributes true = all attributes, false = none,
   *                                       array = listed attribute names,
   *                                       null = remove the tag from the allow-list.
   * @return void
   */
  public static function allowTag($tagName, $attributes) {
    $tagName = strtolower($tagName);
    if ($attributes === null) {
      unset(self::$allowedTags[$tagName]);
      return;
    }
    if (is_array($attributes)) {
      $attributes = array_map('strtolower', $attributes);
    }
    self::$allowedTags[$tagName] = $attributes;
  }

}
