Attribute.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700
  1. <?php
  2. namespace PicoFeed\Filter;
  3. use PicoFeed\Client\Url;
  4. /**
  5. * Attribute Filter class.
  6. *
  7. * @author Frederic Guillot
  8. */
  9. class Attribute
  10. {
  11. /**
  12. * Image proxy url.
  13. *
  14. * @var string
  15. */
  16. private $image_proxy_url = '';
  17. /**
  18. * Image proxy callback.
  19. *
  20. * @var \Closure|null
  21. */
  22. private $image_proxy_callback = null;
  23. /**
  24. * limits the image proxy usage to this protocol.
  25. *
  26. * @var string
  27. */
  28. private $image_proxy_limit_protocol = '';
  29. /**
  30. * Tags and attribute whitelist.
  31. *
  32. * @var array
  33. */
  34. private $attribute_whitelist = array(
  35. 'audio' => array('controls', 'src'),
  36. 'video' => array('poster', 'controls', 'height', 'width', 'src'),
  37. 'source' => array('src', 'type'),
  38. 'dt' => array(),
  39. 'dd' => array(),
  40. 'dl' => array(),
  41. 'table' => array(),
  42. 'caption' => array(),
  43. 'tr' => array(),
  44. 'th' => array(),
  45. 'td' => array(),
  46. 'tbody' => array(),
  47. 'thead' => array(),
  48. 'h1' => array(),
  49. 'h2' => array(),
  50. 'h3' => array(),
  51. 'h4' => array(),
  52. 'h5' => array(),
  53. 'h6' => array(),
  54. 'strong' => array(),
  55. 'em' => array(),
  56. 'code' => array(),
  57. 'pre' => array(),
  58. 'blockquote' => array(),
  59. 'p' => array(),
  60. 'ul' => array(),
  61. 'li' => array(),
  62. 'ol' => array(),
  63. 'br' => array(),
  64. 'del' => array(),
  65. 'a' => array('href'),
  66. 'img' => array('src', 'title', 'alt'),
  67. 'figure' => array(),
  68. 'figcaption' => array(),
  69. 'cite' => array(),
  70. 'time' => array('datetime'),
  71. 'abbr' => array('title'),
  72. 'iframe' => array('width', 'height', 'frameborder', 'src', 'allowfullscreen'),
  73. 'q' => array('cite'),
  74. );
  75. /**
  76. * Scheme whitelist.
  77. *
  78. * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
  79. *
  80. * @var array
  81. */
  82. private $scheme_whitelist = array(
  83. 'bitcoin:',
  84. 'callto:',
  85. 'ed2k://',
  86. 'facetime://',
  87. 'feed:',
  88. 'ftp://',
  89. 'geo:',
  90. 'git://',
  91. 'http://',
  92. 'https://',
  93. 'irc://',
  94. 'irc6://',
  95. 'ircs://',
  96. 'jabber:',
  97. 'magnet:',
  98. 'mailto:',
  99. 'nntp://',
  100. 'rtmp://',
  101. 'sftp://',
  102. 'sip:',
  103. 'sips:',
  104. 'skype:',
  105. 'smb://',
  106. 'sms:',
  107. 'spotify:',
  108. 'ssh:',
  109. 'steam:',
  110. 'svn://',
  111. 'tel:',
  112. );
  113. /**
  114. * Iframe source whitelist, everything else is ignored.
  115. *
  116. * @var array
  117. */
  118. private $iframe_whitelist = array(
  119. 'http://www.youtube.com',
  120. 'https://www.youtube.com',
  121. 'http://player.vimeo.com',
  122. 'https://player.vimeo.com',
  123. 'http://www.dailymotion.com',
  124. 'https://www.dailymotion.com',
  125. 'http://vk.com',
  126. 'https://vk.com',
  127. );
  128. /**
  129. * Blacklisted resources.
  130. *
  131. * @var array
  132. */
  133. private $media_blacklist = array(
  134. 'api.flattr.com',
  135. 'feeds.feedburner.com',
  136. 'share.feedsportal.com',
  137. 'da.feedsportal.com',
  138. 'rc.feedsportal.com',
  139. 'rss.feedsportal.com',
  140. 'res.feedsportal.com',
  141. 'res1.feedsportal.com',
  142. 'res2.feedsportal.com',
  143. 'res3.feedsportal.com',
  144. 'pi.feedsportal.com',
  145. 'rss.nytimes.com',
  146. 'feeds.wordpress.com',
  147. 'stats.wordpress.com',
  148. 'rss.cnn.com',
  149. 'twitter.com/home?status=',
  150. 'twitter.com/share',
  151. 'twitter_icon_large.png',
  152. 'www.facebook.com/sharer.php',
  153. 'facebook_icon_large.png',
  154. 'plus.google.com/share',
  155. 'www.gstatic.com/images/icons/gplus-16.png',
  156. 'www.gstatic.com/images/icons/gplus-32.png',
  157. 'www.gstatic.com/images/icons/gplus-64.png',
  158. );
  159. /**
  160. * Attributes used for external resources.
  161. *
  162. * @var array
  163. */
  164. private $media_attributes = array(
  165. 'src',
  166. 'href',
  167. 'poster',
  168. );
  169. /**
  170. * Attributes that must be integer.
  171. *
  172. * @var array
  173. */
  174. private $integer_attributes = array(
  175. 'width',
  176. 'height',
  177. 'frameborder',
  178. );
  179. /**
  180. * Mandatory attributes for specified tags.
  181. *
  182. * @var array
  183. */
  184. private $required_attributes = array(
  185. 'a' => array('href'),
  186. 'img' => array('src'),
  187. 'iframe' => array('src'),
  188. 'audio' => array('src'),
  189. 'source' => array('src'),
  190. );
  191. /**
  192. * Add attributes to specified tags.
  193. *
  194. * @var array
  195. */
  196. private $add_attributes = array(
  197. 'a' => array('rel' => 'noreferrer', 'target' => '_blank'),
  198. 'video' => array('controls' => 'true'),
  199. );
  200. /**
  201. * List of filters to apply.
  202. *
  203. * @var array
  204. */
  205. private $filters = array(
  206. 'filterAllowedAttribute',
  207. 'filterIntegerAttribute',
  208. 'rewriteAbsoluteUrl',
  209. 'filterIframeAttribute',
  210. 'filterBlacklistResourceAttribute',
  211. 'filterProtocolUrlAttribute',
  212. 'rewriteImageProxyUrl',
  213. 'secureIframeSrc',
  214. 'removeYouTubeAutoplay',
  215. );
  216. /**
  217. * Add attributes to specified tags.
  218. *
  219. * @var \PicoFeed\Client\Url
  220. */
  221. private $website;
  222. /**
  223. * Constructor.
  224. *
  225. * @param \PicoFeed\Client\Url $website Website url instance
  226. */
  227. public function __construct(Url $website)
  228. {
  229. $this->website = $website;
  230. }
  231. /**
  232. * Apply filters to the attributes list.
  233. *
  234. * @param string $tag Tag name
  235. * @param array $attributes Attributes dictionary
  236. *
  237. * @return array Filtered attributes
  238. */
  239. public function filter($tag, array $attributes)
  240. {
  241. foreach ($attributes as $attribute => &$value) {
  242. foreach ($this->filters as $filter) {
  243. if (!$this->$filter($tag, $attribute, $value)) {
  244. unset($attributes[$attribute]);
  245. break;
  246. }
  247. }
  248. }
  249. return $attributes;
  250. }
  251. /**
  252. * Return true if the value is allowed (remove not allowed attributes).
  253. *
  254. * @param string $tag Tag name
  255. * @param string $attribute Attribute name
  256. * @param string $value Attribute value
  257. *
  258. * @return bool
  259. */
  260. public function filterAllowedAttribute($tag, $attribute, $value)
  261. {
  262. return isset($this->attribute_whitelist[$tag]) && in_array($attribute, $this->attribute_whitelist[$tag]);
  263. }
  264. /**
  265. * Return true if the value is not integer (remove attributes that should have an integer value).
  266. *
  267. * @param string $tag Tag name
  268. * @param string $attribute Attribute name
  269. * @param string $value Attribute value
  270. *
  271. * @return bool
  272. */
  273. public function filterIntegerAttribute($tag, $attribute, $value)
  274. {
  275. if (in_array($attribute, $this->integer_attributes)) {
  276. return ctype_digit($value);
  277. }
  278. return true;
  279. }
  280. /**
  281. * Return true if the iframe source is allowed (remove not allowed iframe).
  282. *
  283. * @param string $tag Tag name
  284. * @param string $attribute Attribute name
  285. * @param string $value Attribute value
  286. *
  287. * @return bool
  288. */
  289. public function filterIframeAttribute($tag, $attribute, $value)
  290. {
  291. if ($tag === 'iframe' && $attribute === 'src') {
  292. foreach ($this->iframe_whitelist as $url) {
  293. if (strpos($value, $url) === 0) {
  294. return true;
  295. }
  296. }
  297. return false;
  298. }
  299. return true;
  300. }
  301. /**
  302. * Return true if the resource is not blacklisted (remove blacklisted resource attributes).
  303. *
  304. * @param string $tag Tag name
  305. * @param string $attribute Attribute name
  306. * @param string $value Attribute value
  307. *
  308. * @return bool
  309. */
  310. public function filterBlacklistResourceAttribute($tag, $attribute, $value)
  311. {
  312. if ($this->isResource($attribute) && $this->isBlacklistedMedia($value)) {
  313. return false;
  314. }
  315. return true;
  316. }
  317. /**
  318. * Convert all relative links to absolute url.
  319. *
  320. * @param string $tag Tag name
  321. * @param string $attribute Attribute name
  322. * @param string $value Attribute value
  323. *
  324. * @return bool
  325. */
  326. public function rewriteAbsoluteUrl($tag, $attribute, &$value)
  327. {
  328. if ($this->isResource($attribute)) {
  329. $value = Url::resolve($value, $this->website);
  330. }
  331. return true;
  332. }
  333. /**
  334. * Turns iframes' src attribute from http to https to prevent
  335. * mixed active content.
  336. *
  337. * @param string $tag Tag name
  338. * @param array $attribute Atttributes name
  339. * @param string $value Attribute value
  340. *
  341. * @return bool
  342. */
  343. public function secureIframeSrc($tag, $attribute, &$value)
  344. {
  345. if ($tag === 'iframe' && $attribute === 'src' && strpos($value, 'http://') === 0) {
  346. $value = substr_replace($value, 's', 4, 0);
  347. }
  348. return true;
  349. }
  350. /**
  351. * Removes YouTube autoplay from iframes.
  352. *
  353. * @param string $tag Tag name
  354. * @param array $attribute Atttributes name
  355. * @param string $value Attribute value
  356. *
  357. * @return bool
  358. */
  359. public function removeYouTubeAutoplay($tag, $attribute, &$value)
  360. {
  361. $regex = '%^(https://(?:www\.)?youtube.com/.*\?.*autoplay=)(1)(.*)%i';
  362. if ($tag === 'iframe' && $attribute === 'src' && preg_match($regex, $value)) {
  363. $value = preg_replace($regex, '${1}0$3', $value);
  364. }
  365. return true;
  366. }
  367. /**
  368. * Rewrite image url to use with a proxy.
  369. *
  370. * @param string $tag Tag name
  371. * @param string $attribute Attribute name
  372. * @param string $value Attribute value
  373. *
  374. * @return bool
  375. */
  376. public function rewriteImageProxyUrl($tag, $attribute, &$value)
  377. {
  378. if ($tag === 'img' && $attribute === 'src'
  379. && !($this->image_proxy_limit_protocol !== '' && stripos($value, $this->image_proxy_limit_protocol.':') !== 0)) {
  380. if ($this->image_proxy_url) {
  381. $value = sprintf($this->image_proxy_url, rawurlencode($value));
  382. } elseif (is_callable($this->image_proxy_callback)) {
  383. $value = call_user_func($this->image_proxy_callback, $value);
  384. }
  385. }
  386. return true;
  387. }
  388. /**
  389. * Return true if the scheme is authorized.
  390. *
  391. * @param string $tag Tag name
  392. * @param string $attribute Attribute name
  393. * @param string $value Attribute value
  394. *
  395. * @return bool
  396. */
  397. public function filterProtocolUrlAttribute($tag, $attribute, $value)
  398. {
  399. if ($this->isResource($attribute) && !$this->isAllowedProtocol($value)) {
  400. return false;
  401. }
  402. return true;
  403. }
  404. /**
  405. * Automatically add/override some attributes for specific tags.
  406. *
  407. * @param string $tag Tag name
  408. * @param array $attributes Attributes list
  409. *
  410. * @return array
  411. */
  412. public function addAttributes($tag, array $attributes)
  413. {
  414. if (isset($this->add_attributes[$tag])) {
  415. $attributes += $this->add_attributes[$tag];
  416. }
  417. return $attributes;
  418. }
  419. /**
  420. * Return true if all required attributes are present.
  421. *
  422. * @param string $tag Tag name
  423. * @param array $attributes Attributes list
  424. *
  425. * @return bool
  426. */
  427. public function hasRequiredAttributes($tag, array $attributes)
  428. {
  429. if (isset($this->required_attributes[$tag])) {
  430. foreach ($this->required_attributes[$tag] as $attribute) {
  431. if (!isset($attributes[$attribute])) {
  432. return false;
  433. }
  434. }
  435. }
  436. return true;
  437. }
  438. /**
  439. * Check if an attribute name is an external resource.
  440. *
  441. * @param string $attribute Attribute name
  442. *
  443. * @return bool
  444. */
  445. public function isResource($attribute)
  446. {
  447. return in_array($attribute, $this->media_attributes);
  448. }
  449. /**
  450. * Detect if the protocol is allowed or not.
  451. *
  452. * @param string $value Attribute value
  453. *
  454. * @return bool
  455. */
  456. public function isAllowedProtocol($value)
  457. {
  458. foreach ($this->scheme_whitelist as $protocol) {
  459. if (strpos($value, $protocol) === 0) {
  460. return true;
  461. }
  462. }
  463. return false;
  464. }
  465. /**
  466. * Detect if an url is blacklisted.
  467. *
  468. * @param string $resource Attribute value (URL)
  469. *
  470. * @return bool
  471. */
  472. public function isBlacklistedMedia($resource)
  473. {
  474. foreach ($this->media_blacklist as $name) {
  475. if (strpos($resource, $name) !== false) {
  476. return true;
  477. }
  478. }
  479. return false;
  480. }
  481. /**
  482. * Convert the attribute list to html.
  483. *
  484. * @param array $attributes Attributes
  485. *
  486. * @return string
  487. */
  488. public function toHtml(array $attributes)
  489. {
  490. $html = array();
  491. foreach ($attributes as $attribute => $value) {
  492. $html[] = sprintf('%s="%s"', $attribute, Filter::escape($value));
  493. }
  494. return implode(' ', $html);
  495. }
  496. /**
  497. * Set whitelisted tags and attributes for each tag.
  498. *
  499. * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']]
  500. *
  501. * @return Attribute
  502. */
  503. public function setWhitelistedAttributes(array $values)
  504. {
  505. $this->attribute_whitelist = $values ?: $this->attribute_whitelist;
  506. return $this;
  507. }
  508. /**
  509. * Set scheme whitelist.
  510. *
  511. * @param array $values List of scheme: ['http://', 'ftp://']
  512. *
  513. * @return Attribute
  514. */
  515. public function setSchemeWhitelist(array $values)
  516. {
  517. $this->scheme_whitelist = $values ?: $this->scheme_whitelist;
  518. return $this;
  519. }
  520. /**
  521. * Set media attributes (used to load external resources).
  522. *
  523. * @param array $values List of values: ['src', 'href']
  524. *
  525. * @return Attribute
  526. */
  527. public function setMediaAttributes(array $values)
  528. {
  529. $this->media_attributes = $values ?: $this->media_attributes;
  530. return $this;
  531. }
  532. /**
  533. * Set blacklisted external resources.
  534. *
  535. * @param array $values List of tags: ['http://google.com/', '...']
  536. *
  537. * @return Attribute
  538. */
  539. public function setMediaBlacklist(array $values)
  540. {
  541. $this->media_blacklist = $values ?: $this->media_blacklist;
  542. return $this;
  543. }
  544. /**
  545. * Set mandatory attributes for whitelisted tags.
  546. *
  547. * @param array $values List of tags: ['img' => 'src']
  548. *
  549. * @return Attribute
  550. */
  551. public function setRequiredAttributes(array $values)
  552. {
  553. $this->required_attributes = $values ?: $this->required_attributes;
  554. return $this;
  555. }
  556. /**
  557. * Set attributes to automatically to specific tags.
  558. *
  559. * @param array $values List of tags: ['a' => 'target="_blank"']
  560. *
  561. * @return Attribute
  562. */
  563. public function setAttributeOverrides(array $values)
  564. {
  565. $this->add_attributes = $values ?: $this->add_attributes;
  566. return $this;
  567. }
  568. /**
  569. * Set attributes that must be an integer.
  570. *
  571. * @param array $values List of tags: ['width', 'height']
  572. *
  573. * @return Attribute
  574. */
  575. public function setIntegerAttributes(array $values)
  576. {
  577. $this->integer_attributes = $values ?: $this->integer_attributes;
  578. return $this;
  579. }
  580. /**
  581. * Set allowed iframe resources.
  582. *
  583. * @param array $values List of tags: ['http://www.youtube.com']
  584. *
  585. * @return Attribute
  586. */
  587. public function setIframeWhitelist(array $values)
  588. {
  589. $this->iframe_whitelist = $values ?: $this->iframe_whitelist;
  590. return $this;
  591. }
  592. /**
  593. * Set image proxy URL.
  594. *
  595. * The original image url will be urlencoded
  596. *
  597. * @param string $url Proxy URL
  598. *
  599. * @return Attribute
  600. */
  601. public function setImageProxyUrl($url)
  602. {
  603. $this->image_proxy_url = $url ?: $this->image_proxy_url;
  604. return $this;
  605. }
  606. /**
  607. * Set image proxy callback.
  608. *
  609. * @param \Closure $callback
  610. *
  611. * @return Attribute
  612. */
  613. public function setImageProxyCallback($callback)
  614. {
  615. $this->image_proxy_callback = $callback ?: $this->image_proxy_callback;
  616. return $this;
  617. }
  618. /**
  619. * Set image proxy protocol restriction.
  620. *
  621. * @param string $value
  622. *
  623. * @return Attribute
  624. */
  625. public function setImageProxyProtocol($value)
  626. {
  627. $this->image_proxy_limit_protocol = $value ?: $this->image_proxy_limit_protocol;
  628. return $this;
  629. }
  630. }