Html2Text.php 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. <?php
  2. namespace Html2Text;
  if (!function_exists('mb_split'))
  {
      /**
       * Fallback for mb_split(), declared only when no global mb_split() function exists.
       *
       * The pattern is compiled with preg_split() using "/" as the delimiter. Any "/" in
       * the pattern that is not already escaped with a backslash is escaped first, so a
       * pattern containing a literal slash no longer terminates the regular expression
       * early (which made preg_split() fail with "Unknown modifier" and return false).
       *
       * Matching is byte-oriented (no "u" modifier is added).
       *
       * @param string $pattern regular expression, without delimiters
       * @param string $subject the string to split
       * @param int $limit maximum number of parts; -1 means no limit
       * @return array|false the parts of $subject, or false if the pattern is invalid
       */
      function mb_split($pattern, $subject, $limit = -1)
      {
          // Match a "/" preceded by an even number of backslashes (i.e. an unescaped
          // delimiter) and put one backslash in front of it. Already-escaped "\/" is kept.
          $escaped = preg_replace('~(?<!\\\\)((?:\\\\\\\\)*)/~', '$1\\\\/', $pattern);

          return preg_split('/'.$escaped.'/', $subject, $limit);
      }
  }
  /**
   * Replace all occurrences of the search string with the replacement string.
   *
   * Works like str_replace(), but splits the subject with mb_split() so that matching
   * follows the multibyte regex engine instead of raw bytes:
   * - $search and $replace may each be a string or an array;
   * - when $replace is an array shorter than $search, the missing replacements are '';
   * - when $replace is a string and $search is an array, that string is used for every
   *   search value (same rule as str_replace());
   * - when $subject is an array, every element is processed recursively;
   * - an empty search value never matches and is skipped.
   *
   * @author Sean Murphy <sean@iamseanmurphy.com>
   * @copyright Copyright 2012 Sean Murphy. All rights reserved.
   * @license http://creativecommons.org/publicdomain/zero/1.0/
   * @link http://php.net/manual/function.str-replace.php
   *
   * @param mixed $search value(s) to look for
   * @param mixed $replace replacement value(s)
   * @param mixed $subject string, or array of strings, to search in
   * @param int $count the number of replacements performed is ADDED to this variable
   *                   (it is not reset to zero first)
   * @return mixed the subject with the replacements applied (same shape as $subject)
   */
  function mb_str_replace($search, $replace, $subject, &$count = 0) {
      if (is_array($subject)) {
          // Call mb_str_replace for each subject in the array, recursively
          foreach ($subject as $key => $value) {
              $subject[$key] = mb_str_replace($search, $replace, $value, $count);
          }
          return $subject;
      }

      // Normalize $search and $replace so they are both lists of the same length
      $searches = is_array($search) ? array_values($search) : array($search);
      $replacements = is_array($replace) ? array_values($replace) : array($replace);
      // A scalar replacement applies to every needle; a short array is padded with ''
      $padding = is_array($replace) ? '' : $replace;
      $replacements = array_pad($replacements, count($searches), $padding);

      foreach ($searches as $key => $needle) {
          $needle = (string) $needle;
          if ($needle === '') {
              // Nothing to look for (str_replace() ignores empty needles too)
              continue;
          }
          // Quote the needle, including "/" which the preg-based mb_split fallback uses
          // as its delimiter; an escaped slash still matches a literal slash.
          $parts = mb_split(preg_quote($needle, '/'), $subject);
          if ($parts === false) {
              // The split failed: leave the subject untouched for this needle
              continue;
          }
          $count += count($parts) - 1;
          $subject = implode($replacements[$key], $parts);
      }

      return $subject;
  }
  43. /******************************************************************************
  44. * Copyright (c) 2010 Jevon Wright and others.
  45. * All rights reserved. This program and the accompanying materials
  46. * are made available under the terms of the Eclipse Public License v1.0
  47. * which accompanies this distribution, and is available at
  48. * http://www.eclipse.org/legal/epl-v10.html
  49. *
  50. * or
  51. *
  52. * LGPL which is available at http://www.gnu.org/licenses/lgpl.html
  53. *
  54. *
  55. * Contributors:
  56. * Jevon Wright - initial API and implementation
  57. * Denis Flaven - some fixes for properly handling UTF-8 characters
  58. ****************************************************************************/
  /**
   * Converts HTML into plain text suitable for e-mail display.
   *
   * All methods are static; convert() is the entry point, the other public methods are
   * the building blocks it uses while walking the DOM tree.
   */
  class Html2Text {

      /**
       * Tries to convert the given HTML into a plain text format - best suited for
       * e-mail display, etc.
       *
       * <p>In particular, it tries to maintain the following features:
       * <ul>
       * <li>Links are maintained, with the 'href' copied over
       * <li>Information in the &lt;head&gt; is lost
       * </ul>
       *
       * @param string $html the input HTML (expected to be UTF-8)
       * @return string the HTML converted, as best as possible, to text
       * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
       */
      public static function convert($html) {
          // replace &nbsp; with spaces
          $html = str_replace("&nbsp;", " ", $html);
          // Replace the UTF-8 encoded no-break space (U+00A0). Searching for the complete
          // two-byte sequence is byte-safe: in UTF-8 the lead byte \xc2 never occurs inside
          // another character, so "à" (\xc3\xa0) cannot be hit. Only replacing a bare
          // "\xa0" would corrupt such characters.
          $html = str_replace("\xc2\xa0", " ", $html);

          $html = static::fixNewlines($html);

          $doc = new \DOMDocument();

          // Collect libxml parse warnings internally instead of silencing them with "@".
          $previous = libxml_use_internal_errors(true);
          // The XML prolog forces the UTF-8 character set for HTML fragments
          $loaded = $doc->loadHTML('<?xml encoding="UTF-8">'.$html);
          if (!$previous) {
              // The caller was not collecting libxml errors: drop the ones gathered here.
              // Otherwise they are left in the buffer for the caller to inspect.
              libxml_clear_errors();
          }
          libxml_use_internal_errors($previous);

          if (!$loaded) {
              throw new Html2TextException("Could not load HTML - badly formed?", $html);
          }

          $output = static::iterateOverNode($doc);

          // remove leading and trailing spaces on each line
          $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
          $output = preg_replace("/ *\t */im", "\t", $output);

          // remove unnecessary empty lines
          $output = preg_replace("/\n\n\n*/im", "\n\n", $output);

          // remove leading and trailing whitespace
          return trim($output);
      }

      /**
       * Unify newlines; in particular, \r\n becomes \n, and
       * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
       * all become \ns.
       *
       * @param string $text text with any number of \r, \r\n and \n combinations
       * @return string the fixed text
       */
      public static function fixNewlines($text) {
          // replace \r\n with \n first, so the pair does not become two newlines
          $text = str_replace("\r\n", "\n", $text);
          // any remaining lone \r becomes \n
          $text = str_replace("\r", "\n", $text);

          return $text;
      }

      /**
       * Returns the lower-cased tag name of the closest following sibling that is an element.
       *
       * @param \DOMNode $node the node whose siblings are inspected
       * @return string|null the tag name, or null when no element follows
       */
      public static function nextChildName($node) {
          return self::siblingElementName($node, 'nextSibling');
      }

      /**
       * Returns the lower-cased tag name of the closest preceding sibling that is an element.
       *
       * @param \DOMNode $node the node whose siblings are inspected
       * @return string|null the tag name, or null when no element precedes
       */
      public static function prevChildName($node) {
          return self::siblingElementName($node, 'previousSibling');
      }

      /**
       * Walks the sibling chain in one direction, skipping every node that is not an element.
       *
       * @param \DOMNode $node the starting node (not examined itself)
       * @param string $property 'nextSibling' or 'previousSibling'
       * @return string|null lower-cased name of the first element found, or null
       */
      private static function siblingElementName($node, $property) {
          $sibling = $node->$property;
          while ($sibling !== null && !($sibling instanceof \DOMElement)) {
              $sibling = $sibling->$property;
          }

          return ($sibling === null) ? null : strtolower($sibling->nodeName);
      }

      /**
       * Recursively renders a DOM node, and all of its children, as plain text.
       *
       * Text nodes have their whitespace runs collapsed to a single space. Elements
       * contribute leading/trailing whitespace depending on their tag name, links are
       * rendered as [text](href) and images as [title] or [alt].
       *
       * @param \DOMNode $node the node to render
       * @return string the text for this node
       */
      public static function iterateOverNode($node) {
          if ($node instanceof \DOMText) {
              // Replace whitespace characters with a space (equivalent to \s)
              return preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
          }
          if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
              // Neither carries content. The name of a processing instruction is its
              // target, which must not be mistaken for a tag name in the switches below.
              return "";
          }

          $nextName = static::nextChildName($node);
          $name = strtolower($node->nodeName);

          // start whitespace
          switch ($name) {
              case "hr":
                  return "---------------------------------------------------------------\n";

              case "style":
              case "head":
              case "title":
              case "meta":
              case "script":
                  // ignore these tags
                  return "";

              case "h1":
              case "h2":
              case "h3":
              case "h4":
              case "h5":
              case "h6":
              case "ol":
              case "ul":
                  // one newline before the content (headings get another one after it)
                  $output = "\n";
                  break;

              case "td":
              case "th":
                  // add tab char to separate table fields
                  $output = "\t";
                  break;

              case "tr":
              case "p":
              case "div":
                  // add one line
                  $output = "\n";
                  break;

              case "li":
                  $output = "- ";
                  break;

              default:
                  // print out contents of unknown tags
                  $output = "";
                  break;
          }

          if (isset($node->childNodes)) {
              foreach ($node->childNodes as $child) {
                  $output .= static::iterateOverNode($child);
              }
          }

          // end whitespace
          switch ($name) {
              case "h1":
              case "h2":
              case "h3":
              case "h4":
              case "h5":
              case "h6":
                  $output .= "\n";
                  break;

              case "p":
              case "br":
                  // add one line, unless a div follows (it brings its own newline)
                  if ($nextName !== "div") {
                      $output .= "\n";
                  }
                  break;

              case "div":
                  // add one line only if there is a next element and it isn't a div
                  if ($nextName !== "div" && $nextName !== null) {
                      $output .= "\n";
                  }
                  break;

              case "a":
                  // only elements have attributes to read
                  if ($node instanceof \DOMElement) {
                      $output = self::formatLink($node, $output, $nextName);
                  }
                  break;

              case "img":
                  if ($node instanceof \DOMElement) {
                      // an image is rendered as [title], else [alt], else nothing
                      $label = $node->getAttribute("title");
                      if ($label === '') {
                          $label = $node->getAttribute("alt");
                      }
                      $output = ($label === '') ? "" : "[" . $label . "]";
                  }
                  break;

              case "li":
                  $output .= "\n";
                  break;

              default:
                  // do nothing
          }

          return $output;
      }

      /**
       * Renders an <a> element whose children have already been converted to text.
       *
       * Links are returned in [text](link) format. When the text is the link itself
       * (optionally without its mailto:, http:// or https:// prefix) only the text is
       * kept; when there is no text at all the href is used. Empty values are detected
       * with strict comparisons so that a text or title of "0" is preserved.
       *
       * @param \DOMElement $node the link element
       * @param string $output the text rendered from the children of the link
       * @param string|null $nextName tag name of the next sibling element, if any
       * @return string the text for the link
       */
      private static function formatLink($node, $output, $nextName) {
          $href = $node->getAttribute("href");
          $title = $node->getAttribute("title");

          $output = trim($output);

          // remove double [[ ]] s from linking images
          if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
              $output = substr($output, 1, strlen($output) - 2);

              // for linking images, the title of the <a> overrides the title of the <img>
              if ($title !== '') {
                  $output = $title;
              }
          }

          // if there is no link text, but a title attr
          if ($output === '' && $title !== '') {
              $output = $title;
          }

          if ($href === '') {
              // it doesn't link anywhere: a named anchor is shown between brackets
              if ($node->getAttribute("name") !== '') {
                  $output = "[$output]";
              }
          } elseif ($href === $output || $href === "mailto:$output" || $href === "http://$output" || $href === "https://$output") {
              // link to the same address: the text alone is enough, keep it as is
          } elseif ($output !== '') {
              $output = "[$output]($href)";
          } else {
              // no text at all: show the target
              $output = $href;
          }

          // does the next node require additional whitespace?
          if (in_array($nextName, array("h1", "h2", "h3", "h4", "h5", "h6"), true)) {
              $output .= "\n";
          }

          return $output;
      }
  }