Html2Text.php 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. <?php
  2. namespace Html2Text;
  3. /**
  4. * Replace all occurrences of the search string with the replacement string.
  5. *
  6. * @author Sean Murphy <sean@iamseanmurphy.com>
  7. * @copyright Copyright 2012 Sean Murphy. All rights reserved.
  8. * @license http://creativecommons.org/publicdomain/zero/1.0/
  9. * @link http://php.net/manual/function.str-replace.php
  10. *
  11. * @param mixed $search
  12. * @param mixed $replace
  13. * @param mixed $subject
  14. * @param int $count
  15. * @return mixed
  16. */
  17. function mb_str_replace($search, $replace, $subject, &$count = 0) {
  18. if (!is_array($subject)) {
  19. // Normalize $search and $replace so they are both arrays of the same length
  20. $searches = is_array($search) ? array_values($search) : array($search);
  21. $replacements = is_array($replace) ? array_values($replace) : array($replace);
  22. $replacements = array_pad($replacements, count($searches), '');
  23. foreach ($searches as $key => $search) {
  24. $parts = mb_split(preg_quote($search), $subject);
  25. $count += count($parts) - 1;
  26. $subject = implode($replacements[$key], $parts);
  27. }
  28. } else {
  29. // Call mb_str_replace for each subject in array, recursively
  30. foreach ($subject as $key => $value) {
  31. $subject[$key] = mb_str_replace($search, $replace, $value, $count);
  32. }
  33. }
  34. return $subject;
  35. }
  36. /******************************************************************************
  37. * Copyright (c) 2010 Jevon Wright and others.
  38. * All rights reserved. This program and the accompanying materials
  39. * are made available under the terms of the Eclipse Public License v1.0
  40. * which accompanies this distribution, and is available at
  41. * http://www.eclipse.org/legal/epl-v10.html
  42. *
  43. * or
  44. *
  45. * LGPL which is available at http://www.gnu.org/licenses/lgpl.html
  46. *
  47. *
  48. * Contributors:
  49. * Jevon Wright - initial API and implementation
  50. * Denis Flaven - some fixes for properly handling UTF-8 characters
  51. ****************************************************************************/
  52. class Html2Text {
  53. /**
  54. * Tries to convert the given HTML into a plain text format - best suited for
  55. * e-mail display, etc.
  56. *
  57. * <p>In particular, it tries to maintain the following features:
  58. * <ul>
  59. * <li>Links are maintained, with the 'href' copied over
  60. * <li>Information in the &lt;head&gt; is lost
  61. * </ul>
  62. *
  63. * @param string html the input HTML
  64. * @return string the HTML converted, as best as possible, to text
  65. * @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
  66. */
  67. static function convert($html) {
  68. // replace &nbsp; with spaces
  69. $html = str_replace("&nbsp;", " ", $html);
  70. $html = mb_str_replace("\xa0", " ", $html); // DO NOT USE str_replace since it breaks the "à" character which is \xc3 \xa0 in UTF-8
  71. $html = static::fixNewlines($html);
  72. $doc = new \DOMDocument();
  73. if (!@$doc->loadHTML('<?xml encoding="UTF-8">'.$html)) // Forces the UTF-8 character set for HTML fragments
  74. {
  75. throw new Html2TextException("Could not load HTML - badly formed?", $html);
  76. }
  77. $output = static::iterateOverNode($doc);
  78. // remove leading and trailing spaces on each line
  79. $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
  80. $output = preg_replace("/ *\t */im", "\t", $output);
  81. // remove unnecessary empty lines
  82. $output = preg_replace("/\n\n\n*/im", "\n\n", $output);
  83. // remove leading and trailing whitespace
  84. $output = trim($output);
  85. return $output;
  86. }
  87. /**
  88. * Unify newlines; in particular, \r\n becomes \n, and
  89. * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
  90. * all become \ns.
  91. *
  92. * @param string text text with any number of \r, \r\n and \n combinations
  93. * @return string the fixed text
  94. */
  95. static function fixNewlines($text) {
  96. // replace \r\n to \n
  97. $text = str_replace("\r\n", "\n", $text);
  98. // remove \rs
  99. $text = str_replace("\r", "\n", $text);
  100. return $text;
  101. }
  102. static function nextChildName($node) {
  103. // get the next child
  104. $nextNode = $node->nextSibling;
  105. while ($nextNode != null) {
  106. if ($nextNode instanceof \DOMElement) {
  107. break;
  108. }
  109. $nextNode = $nextNode->nextSibling;
  110. }
  111. $nextName = null;
  112. if ($nextNode instanceof \DOMElement && $nextNode != null) {
  113. $nextName = strtolower($nextNode->nodeName);
  114. }
  115. return $nextName;
  116. }
  117. static function prevChildName($node) {
  118. // get the previous child
  119. $nextNode = $node->previousSibling;
  120. while ($nextNode != null) {
  121. if ($nextNode instanceof \DOMElement) {
  122. break;
  123. }
  124. $nextNode = $nextNode->previousSibling;
  125. }
  126. $nextName = null;
  127. if ($nextNode instanceof \DOMElement && $nextNode != null) {
  128. $nextName = strtolower($nextNode->nodeName);
  129. }
  130. return $nextName;
  131. }
  132. static function iterateOverNode($node) {
  133. if ($node instanceof \DOMText) {
  134. // Replace whitespace characters with a space (equivilant to \s)
  135. return preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
  136. }
  137. if ($node instanceof \DOMDocumentType) {
  138. // ignore
  139. return "";
  140. }
  141. $nextName = static::nextChildName($node);
  142. $prevName = static::prevChildName($node);
  143. $name = strtolower($node->nodeName);
  144. // start whitespace
  145. switch ($name) {
  146. case "hr":
  147. return "---------------------------------------------------------------\n";
  148. case "style":
  149. case "head":
  150. case "title":
  151. case "meta":
  152. case "script":
  153. // ignore these tags
  154. return "";
  155. case "h1":
  156. case "h2":
  157. case "h3":
  158. case "h4":
  159. case "h5":
  160. case "h6":
  161. case "ol":
  162. case "ul":
  163. // add two newlines, second line is added below
  164. $output = "\n";
  165. break;
  166. case "td":
  167. case "th":
  168. // add tab char to separate table fields
  169. $output = "\t";
  170. break;
  171. case "tr":
  172. case "p":
  173. case "div":
  174. // add one line
  175. $output = "\n";
  176. break;
  177. case "li":
  178. $output = "- ";
  179. break;
  180. default:
  181. // print out contents of unknown tags
  182. $output = "";
  183. break;
  184. }
  185. // debug
  186. //$output .= "[$name,$nextName]";
  187. if (isset($node->childNodes)) {
  188. for ($i = 0; $i < $node->childNodes->length; $i++) {
  189. $n = $node->childNodes->item($i);
  190. $text = static::iterateOverNode($n);
  191. $output .= $text;
  192. }
  193. }
  194. // end whitespace
  195. switch ($name) {
  196. case "h1":
  197. case "h2":
  198. case "h3":
  199. case "h4":
  200. case "h5":
  201. case "h6":
  202. $output .= "\n";
  203. break;
  204. case "p":
  205. case "br":
  206. // add one line
  207. if ($nextName != "div")
  208. $output .= "\n";
  209. break;
  210. case "div":
  211. // add one line only if the next child isn't a div
  212. if ($nextName != "div" && $nextName != null)
  213. $output .= "\n";
  214. break;
  215. case "a":
  216. // links are returned in [text](link) format
  217. $href = $node->getAttribute("href");
  218. $output = trim($output);
  219. // remove double [[ ]] s from linking images
  220. if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
  221. $output = substr($output, 1, strlen($output) - 2);
  222. // for linking images, the title of the <a> overrides the title of the <img>
  223. if ($node->getAttribute("title")) {
  224. $output = $node->getAttribute("title");
  225. }
  226. }
  227. // if there is no link text, but a title attr
  228. if (!$output && $node->getAttribute("title")) {
  229. $output = $node->getAttribute("title");
  230. }
  231. if ($href == null) {
  232. // it doesn't link anywhere
  233. if ($node->getAttribute("name") != null) {
  234. $output = "[$output]";
  235. }
  236. } else {
  237. if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
  238. // link to the same address: just use link
  239. $output;
  240. } else {
  241. // replace it
  242. if ($output) {
  243. $output = "[$output]($href)";
  244. } else {
  245. // empty string
  246. $output = $href;
  247. }
  248. }
  249. }
  250. // does the next node require additional whitespace?
  251. switch ($nextName) {
  252. case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
  253. $output .= "\n";
  254. break;
  255. }
  256. break;
  257. case "img":
  258. if ($node->getAttribute("title")) {
  259. $output = "[" . $node->getAttribute("title") . "]";
  260. } elseif ($node->getAttribute("alt")) {
  261. $output = "[" . $node->getAttribute("alt") . "]";
  262. } else {
  263. $output = "";
  264. }
  265. break;
  266. case "li":
  267. $output .= "\n";
  268. break;
  269. default:
  270. // do nothing
  271. }
  272. return $output;
  273. }
  274. }