Tag content retrieval from websites with preg_match

arnage

Moderator

Join Date: May 2009

Posts: 817
#1

Tag content retrieval from websites with preg_match

22.03.12, 19:42

Interesting article from the net.
These functions allow to get any tag from a website, read the contents and return an array with a) the contents b) the count of occurrences. Furthermore added: retrieve mails, get doctype, keywords, metas, link rel's, p tags, h tags, comments, inline used classes and id's, title tags, images, alt desc of images, separate internal links from external links and much more.

tagretrieval.php

PHP Code:

<?php /* Functions for retrieving web document tags */ /* written by artViper designstudio ©2007 all rights reserved */ /* this function list is listed under the GPL */ /* if you use this, please honor our work and name us on your page */ /* if you have further questions, enhancements or anything else */ /* then drop a line at admin@artviper.net */ /* most functions return the content of the requested tags in array[0] */ /* and the count in array[1] except those, where a special function to */ /* retrieve the count is given */ /* example usage : $file = file_get_contents("http://www.artviper.com"); $x = (get_link_rel($file)); print_r($x); */ // retrieve doctype of document function get_doctype($file){ $h1tags = preg_match('/<!DOCTYPE (\w.*)dtd">/is',$file,$patterns); $res = array(); array_push($res,$patterns[0]); array_push($res,count($patterns[0])); return $res; } // retrieve page title function get_doc_title($file){ $h1tags = preg_match('/<title> ?.* <\/title>/isx',$file,$patterns); $res = array(); array_push($res,$patterns[0]); array_push($res,count($patterns[0])); return $res; } // retrieve keywords function get_keywords($file){ $h1tags = preg_match('/(<meta name="keywords" content="(.*)" \/>)/i',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // get rel links in header of the site function get_link_rel($file){ $h1tags = preg_match_all('/(rel=)(".*") href=(".*")/im',$file,$patterns); $res = array(); array_push($res,$patterns); array_push($res,count($patterns[2])); return $res; } function get_external_css($file){ $h1tags = preg_match_all('/(href=")(\w.*\.css)"/i',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all h1 tags function get_h1($file){ $h1tags = preg_match_all("/(<h1.*>)(\w.*)(<\/h1>)/isxmU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all h2 tags function get_h2($file){ $h1tags = preg_match_all("/(<h2.*>)(\w.*)(<\/h2>)/isxmU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all h3 tags function get_h3($file){ $h1tags = preg_match_all("/(<h3.*>)(\w.*)(<\/h3>)/ismU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all h4 tags function get_h4($file){ $h1tags = preg_match_all("/(<h4.*>)(\w.*)(<\/h4>)/ismU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all h5 tags function get_h5($file){ $h1tags = preg_match_all("/(<h5.*>)(\w.*)(<\/h5>)/ismU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all h5 tags function get_h6($file){ $h1tags = preg_match_all("/(<h6.*>)(\w.*)(<\/h6>)/ismU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve p tag contents function get_p($file){ $h1tags = preg_match_all("/(<p.*>)(\w.*)(<\/p>)/ismU",$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve names of links function get_a_content($file){ $h1count = preg_match_all("/(<a.*>)(\w.*)(<.*>)/ismU",$file,$patterns); return $patterns[2]; } // retrieve link destinations function get_a_href($file){ $h1count = preg_match_all('/(href=")(.*?)(")/i',$file,$patterns); return $patterns[2]; } // get count of href's function get_a_href_count($file){ $h1count = preg_match_all('/<(a.*) href=\"(.*?)\"(.*)<\/a>/',$file,$patterns); return count($patterns[0]); } //get all additional tags inside a link tag function get_a_additionaltags($file){ $h1count = preg_match_all('/<(a.*) href="(.*?)"(.*)>(.*)(<\/a>)/',$file,$patterns); return $patterns[3]; } // retrieve span's function get_span($file){ $h1count = preg_match_all('/(<span .*>)(.*)(<\/span>)/',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve spans on the site function get_script($file){ $h1count = preg_match_all('/(<script.*>)(.*)(<\/script>)/imxsU',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve content of ul's function get_ul($file){ $h1count = preg_match_all('/(<ul \w*>)(.*)(<\/ul>)/ismxU',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } //retrieve li contents function get_li($file){ $h1count = preg_match_all('/(<li \w*>)(.*)(<\/li>)/ismxU',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve page comments function get_comments($file){ $h1count = preg_match_all('/()/isU',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all used id's on the page function get_ids($file){ $h1count = preg_match_all('/(id="(\w*)")/is',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve all used classes ( inline ) of the document function get_classes($file){ $h1count = preg_match_all('/(class="(\w*)")/is',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // get the meta tag contents function get_meta_content($file){ $h1count = preg_match_all('/(<meta)(.*="(.*)").\/>/ix',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // get inline styles function get_styles($file){ $h1count = preg_match_all('/(style=")(.*?)(")/is',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // get titles of tags function get_tag_titles($file){ $h1count = preg_match_all('/(title=)"(.*)"(.*)/',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // get image alt descriptions function get_image_alt($file){ $h1count = preg_match_all('/(alt=.)([a-zA-Z0-9\s]{1,})/',$file,$patterns); $res = array(); array_push($res,$patterns[2]); array_push($res,count($patterns[2])); return $res; } // retrieve images on the site function get_images($file){ $h1count = preg_match_all('/(<img)\s (src="([a-zA-Z0-9\.;:\/\?&=_|\r|\n]{1,})")/isxmU',$file,$patterns); $res = array(); array_push($res,$patterns[3]); array_push($res,count($patterns[3])); return $res; } // retrieve email address of the mailto tag if any function get_mailto($file){ $h1count = preg_match_all('/(<a\shref=")(mailto:)([a-zA-Z@0-9\.]{1,})"/ims',$file,$patterns); $res = array(); array_push($res,$patterns[3]); array_push($res,count($patterns[3])); return $res; } // retrieve any email function get_emails($file){ $h1count = preg_match_all('/[a-zA-Z0-9_-]{1,}@[a-zA-Z0-9-_]{1,}\.[a-zA-Z]{1,4}/',$file,$patterns); $res = array(); array_push($res,$patterns[0]); array_push($res,count($patterns[0])); return $res; } // count used keywords function countkeyword($word,$file){ $x = preg_match_all("/(.*)($word)(.*)/",$file,$patterns); return count($patterns); } // retrieve internal site links function get_internal_links($array){ $result = array(); $count = count($array); for($i=0;$i<$count;$i++){ if(!empty($array[$i])){ if(strpos($array[$i],"www",0) === false){ if(strpos($array[$i],"http",0) === false){ array_push($result,$array[$i]); } } } } return $result; } // retrieve external links function get_external_links($array){ $result = array(); $count = count($array); for($i=0;$i<$count;$i++){ if(!empty($array[$i])){ if(strpos($array[$i],"www",0) !== false){ if(strpos($array[$i],"http",0) !== false){ array_push($result,$array[$i]); } } } } return $result; } // retrieve the main url of the site function get_main_url($url){ $parts = parse_url($url); $url = $parts["scheme"] ."://".$parts["host"]; return $url; } // retrieve just the name without www and com/eu/de etc function get_domain_name_only($url){ $match = preg_match("/(.*:\/\/)\w{0,}(.*)\.(.*)/",$url,$patterns); $patterns[2] = str_replace(".","",$patterns[2]); return $patterns[2]; } ?>

Usage Example

PHP Code:

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <title>Untitled Document</title> <style type="text/css">  </style> </head> <body><form action="" method="post" enctype="application/x-www-form-urlencoded" name="form1"> <input name="url" type="text" /><input name="submit" type="submit" value="submit" /></form> <?php if(isset($_POST['submit']) && $_POST['submit'] == "submit"){ $resultArray = array(); $url = htmlentities(strip_tags($_POST['url'])); include("tagretrieval.php"); if(strpos($url,"http",0) === false){ $url = "http://$url"; } echo "<img src=\"http://www.artviper.net/screenshots/screener.php?url=$url&h=180&w=240&sdx=1024&sdy=768\" alt=\"$url\" style=\"margin:20px\" />"; $file = file_get_contents($url); $doctype = get_doctype($file); $keywords = get_keywords($file); $css = get_external_css($file); $h1 = get_h1($file); $h2 = get_h2($file); $h3 = get_h3($file); $p = get_p($file); $title = get_doc_title($file); $links = get_a_href($file); $href_add = get_a_additionaltags($file); $images = get_images($file); $styles = get_styles($file); $ids = get_ids($file); $classes = get_classes($file); echo"<h1>Document properties</h1>"; // get doctype if(!empty($doctype[0])){ $doctype = preg_replace("/</",'<',$doctype[0]); echo "<br/>Valid doctype: $doctype"; }else{ $doctype = "<br/>No doctype specified<br/>"; echo $doctype; } // get doc title if(!empty($title[0])){ $title[0] = preg_replace("/</","<",$title[0]); echo "<br/>Title found: $title[0]<"; }else{ echo"<br/><div class=\"error\">Page does not have a title</div><br/>"; } // get external references CSS files // get h1 tags if($css[1] != 0){ echo "<br/>external CSS found: $css[1]<ul>"; foreach($css[0] as $key => $val){ echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No external CSS found</div><br/>"; } // get keywords if(!empty($keywords[0])){ echo"<br/>Keywords found: $keywords[0]"; }else{ echo "<br/><div class=\"error\">No keywords specified</div><br/>"; } echo"<h1>Content properties</h1>"; // get h1 tags if($h1[1] != 0){ echo "<br/>H1 Tags found: $h1[1]<ul>"; foreach($h1[0] as $key => $val){ echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No H1 Tags found</div><br/>"; } // get h2 tags if($h2[1] != 0){ echo "<br/>H2 Tags found: $h2[1]<ul>"; foreach($h2[0] as $key => $val){ echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No H2 Tags found</div><br/>"; } // get h3 tags if($h3[1] != 0){ echo "<br/>H3 Tags found: $h3[1]<ul>"; foreach($h3[0] as $key => $val){ echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No H3 Tags found</div><br/>"; } // get p tags if($p[1] != 0){ echo "<br/>p Tags found: $p[1]<ul>"; foreach($p[0] as $key => $val){ echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No p Tags found</div><br/>"; } echo "<h1>Link structure</h1>"; if(!empty($links[0])){ echo "<br/>Links found:<ul>"; foreach($links as $key => $val){ $val = preg_replace("/</","<",$val); echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No Links found</div><br/>"; } if(!empty($href_add[0])){ echo "<br/>href additional tags:<ul>"; foreach($href_add as $key => $val){ $val = preg_replace("/</","<",$val); echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/><div class=\"error\">No additional styles for href found</div><br/>"; } echo "<h1>Images</h1>"; if(!empty($images[0])){ echo "<br/>images:<ul>"; foreach($images[0] as $key => $val){ $val = preg_replace("/</","<",$val); echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/>No images found"; } echo "<h1>Styles, ID's & Classes</h1>"; if(!empty($ids[0])){ echo "<br/>ID's:<ul>"; foreach($ids[0] as $key => $val){ $val = preg_replace("/</","<",$val); echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/>No ID's found<br/>"; } if(!empty($classes[0])){ echo "<br/>classes:<ul>"; foreach($classes[0] as $key => $val){ $val = preg_replace("/</","<",$val); echo "<li>" . htmlentities($val) . "</li>"; } echo "</ul>"; }else{ echo "<br/>No classes found<br/>"; } if(!empty($styles[0])){ echo "<br/>inline styles:<ul>"; foreach($styles[0] as $key => $val){ $val = preg_replace("/</","<",$val); echo "<li>" . htmlentities($val) . "</li>"; } echo "<div class=\"notice\">Your document uses inline styles. If applicable, try to put them into a separate CSS file and restyle them to ID's or CLASSES.</div></ul>"; }else{ echo "<br/>No inline styles used"; } } ?> </body> </html>

Code Examples -> Tag content retrieval from websites with preg_match

<!DOCTYPE html PUBLIC "-//WAPFORUM.RS
Tags: None

Previous template Next