<?php
 
/**
 * Class for getting general information about HTML content.
 *
 * All look-ups are regular-expression based and work on the raw document
 * string. Tag and attribute names are matched case-insensitively, while the
 * extracted values keep the case they have in the document (URLs and titles
 * are case-sensitive data and must not be lower-cased).
 *
 * @author    Sven Wagener <wagener_at_indot_dot_de>
 */
class html_info{

    /**
     * The whole HTML document that is analysed.
     * @var string
     */
    public $string="";

    /**
     * Result of the last get_meta_data() call. Stays "" until that method
     * has been called, afterwards it holds the returned array.
     * @var array|string
     */
    public $meta="";

    /**
     * Constructor of class html_info
     *
     * @param string $html_string The whole HTML document as String
     */
    public function __construct($html_string=""){
        $this->string=(string)$html_string;
    }

    /**
     * Legacy PHP 4 style initialiser.
     *
     * A method named like its class is no longer treated as a constructor by
     * PHP 8, so construction is handled by __construct(). This method is kept
     * so that code calling html_info() explicitly keeps working.
     *
     * @param string $html_string The whole HTML document as String
     */
    public function html_info($html_string){
        $this->string=(string)$html_string;
    }

    /**
     * Returns the title
     *
     * @return string the content of the first <title> element with surrounding
     *                whitespace removed, or "" if the document has no title
     */
    public function get_title(){
        if(preg_match('~<title(?:\s[^>]*)?>(.*?)</title\s*>~is',(string)$this->string,$found)){
            return trim($found[1]);
        }
        return "";
    }

    /**
     * Returns the meta data
     *
     * Only <meta> tags carrying a non-empty name attribute are reported and
     * every name is reported once (first occurrence wins). The name is
     * lower-cased so that callers can compare against e.g. "keywords"; the
     * content keeps its original case. The result is also stored in $this->meta.
     *
     * @return array list of entries ($matches[$i]['name'] and $matches[$i]['content']),
     *               empty if the document has no named meta tags
     */
    public function get_meta_data(){
        $meta=array();
        $seen=array();

        foreach($this->find_tag_attributes('meta') as $attributes){
            $name=strtolower($this->get_tag_param("name",$attributes));
            if($name==="" || isset($seen[$name])){
                continue;
            }
            $seen[$name]=true;
            $meta[]=array(
                'name'=>$name,
                'content'=>$this->get_tag_param("content",$attributes)
            );
        }

        $this->meta=$meta;
        return $meta;
    }

    /**
     * Returns all images
     *
     * Images without a src are skipped and every src is reported once.
     *
     * @return array list of entries ($match[$i]['src'], $match[$i]['alt'],
     *               $match[$i]['width'] and $match[$i]['height']); attributes
     *               that are missing on the tag are reported as ""
     */
    public function get_images(){
        $images=array();
        $seen=array();

        foreach($this->find_tag_attributes('img') as $attributes){
            $src=$this->get_tag_param("src",$attributes);
            if($src==="" || isset($seen[$src])){
                continue;
            }
            $seen[$src]=true;
            $images[]=array(
                'src'=>$src,
                'alt'=>$this->get_tag_param("alt",$attributes),
                'width'=>$this->get_tag_param("width",$attributes),
                'height'=>$this->get_tag_param("height",$attributes)
            );
        }

        return $images;
    }

    /**
     * Returns all links
     *
     * Anchors without a href are skipped and every href is reported once.
     *
     * @return array list of entries ($match[$i]['href'] and $match[$i]['target']);
     *               a missing target is reported as ""
     */
    public function get_links(){
        $links=array();
        $seen=array();

        foreach($this->find_tag_attributes('a') as $attributes){
            $href=$this->get_tag_param("href",$attributes);
            if($href==="" || isset($seen[$href])){
                continue;
            }
            $seen[$href]=true;
            $links[]=array(
                'href'=>$href,
                'target'=>$this->get_tag_param("target",$attributes)
            );
        }

        return $links;
    }

    /**
     * Returns all strings which are formatted like the given parameters
     *
     * A string is returned only if it is nested in all of the requested
     * formatting tags (<b>, <i>, <u>), in any nesting order.
     *
     * @param boolean $bold if string has to be formatted bold choose true
     * @param boolean $italic if string has to be formatted italic choose true
     * @param boolean $underlined if string has to be formatted underlined choose true
     * @return array the strings which have been found; empty if nothing was
     *               requested or nothing was found
     */
    public function get_strings_formated($bold,$italic,$underlined){
        $tags=array();
        if($bold){
            $tags[]=array('open'=>"<b>",'close'=>"</b>");
        }
        if($italic){
            $tags[]=array('open'=>"<i>",'close'=>"</i>");
        }
        if($underlined){
            $tags[]=array('open'=>"<u>",'close'=>"</u>");
        }

        return $this->get_strings_in_tags($tags,$this->string);
    }

    /**
     * Returns all strings in $string which are enclosed by all of the given tags
     *
     * For every tag the enclosed contents are looked up and then searched
     * recursively for the remaining tags. Once a single tag is left, its
     * contents are returned with any other markup stripped.
     *
     * @param array $tags the tags in an array ($tags[$i]['open'] and $tags[$i]['close'])
     * @param string $string the HTML string
     * @return array the strings which have been found, empty if there are none
     */
    public function get_strings_in_tags($tags,$string){
        $tags=is_array($tags) ? array_values($tags) : array();
        $tag_count=count($tags);
        $end_matches=array();

        for($i=0;$i<$tag_count;$i++){
            $contents=$this->get_strings_in_tag($tags[$i]['open'],$tags[$i]['close'],$string);

            // Last tag of the chain: the contents are the result
            if($tag_count==1){
                foreach($contents as $content){
                    $end_matches[]=strip_tags($content);
                }
                return $end_matches;
            }

            // All tags except the current one still have to enclose the string
            $new_tags=array();
            for($j=0;$j<$tag_count;$j++){
                if($tags[$j]['open']!=$tags[$i]['open'] && $tags[$j]['close']!=$tags[$i]['close']){
                    $new_tags[]=$tags[$j];
                }
            }

            // Newly found strings are put in front of the ones found so far
            foreach($contents as $content){
                $end_matches=array_merge($this->get_strings_in_tags($new_tags,$content),$end_matches);
            }
        }

        return $end_matches;
    }

    /**
     * Returns all strings in $string which are between the start and end tag
     *
     * Both tags are taken literally (regular expression characters in them
     * have no special meaning) and are matched case-insensitively; the
     * enclosed string may span several lines.
     *
     * @param string $start_tag the starting tag
     * @param string $end_tag the end tag
     * @param string $string the string to search in
     * @return array the strings which have been found, empty if there are none
     */
    public function get_strings_in_tag($start_tag,$end_tag,$string){
        $pattern='~'.preg_quote((string)$start_tag,'~').'(.*?)'.preg_quote((string)$end_tag,'~').'~is';
        if(!preg_match_all($pattern,(string)$string,$found)){
            return array();
        }
        return $found[1];
    }

    /**
     * Returns all strings which are headed (<h1> ... </h1> etc)
     *
     * Headings are matched regardless of case and of attributes on the
     * opening tag. The result is ordered by heading level, then by position.
     *
     * @param int $from_headnumber the first heading level to look for
     * @param int $till_headnumber the last heading level to look for
     * @return array the inner HTML of all headings found, empty if there are none
     */
    public function get_strings_headed($from_headnumber,$till_headnumber){
        $result_arr=array();
        $first_level=(int)$from_headnumber;
        $last_level=(int)$till_headnumber;

        for($level=$first_level;$level<=$last_level;$level++){
            $pattern='~<h'.$level.'(?:\s[^>]*)?>(.*?)</h'.$level.'\s*>~is';
            if(preg_match_all($pattern,(string)$this->string,$found)){
                $result_arr=array_merge($result_arr,$found[1]);
            }
        }

        return $result_arr;
    }

    /**
     * Returns the content of the body
     *
     * @return string the markup between <body ...> and </body> (or the end of
     *                the document if the closing tag is missing); "" if the
     *                document has no <body> tag
     */
    public function get_body(){
        return (string)$this->extract_body();
    }

    /**
     * Returns the content of the body without tags
     *
     * If the document has no <body> tag (e.g. an HTML fragment) the whole
     * document is used. Line breaks, tabs, non-breaking spaces and runs of
     * blanks are reduced to single spaces so that words stay separated.
     *
     * @return string the content of the body without tags
     */
    public function get_body_text(){
        $string=$this->extract_body();
        if($string===null){
            $string=(string)$this->string;
        }

        $string=strip_tags($string);

        // Left-overs of script-hiding comments that strip_tags() did not remove
        $string=str_replace(array("<!--","//-->"),"",$string);

        // Non-breaking spaces (entity and UTF-8 character) count as ordinary blanks
        $string=str_replace(array("&nbsp;","\xC2\xA0")," ",$string);
        $string=preg_replace('~\s+~'," ",$string);

        return trim((string)$string);
    }

    /**
     * Returns the frame urls
     *
     * @return array the src of every <frame> and <iframe> tag, each url once,
     *               empty if the document has no frames
     */
    public function get_frame_urls(){
        $frame_urls=array();
        $seen=array();

        foreach($this->find_tag_attributes('i?frame') as $attributes){
            $src=$this->get_tag_param("src",$attributes);
            if($src==="" || isset($seen[$src])){
                continue;
            }
            $seen[$src]=true;
            $frame_urls[]=$src;
        }

        return $frame_urls;
    }

    /**
     * Returns the value of an attribute of a tag
     *
     * The attributes are read one after the other, so a name that merely
     * occurs inside another attribute's name (data-src) or value is not
     * mistaken for the wanted attribute. Double-quoted, single-quoted and
     * unquoted values are supported; the first occurrence wins.
     *
     * @param string $param the attribute name (case-insensitive)
     * @param string $tag the tag, or just its attribute part
     * @return string the attribute value, "" if the attribute is missing or empty
     */
    public function get_tag_param($param,$tag){
        $wanted=strtolower((string)$param);
        $pattern='~([^\s"\'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+)))?~';

        if(!preg_match_all($pattern,(string)$tag,$attributes,PREG_SET_ORDER)){
            return "";
        }

        foreach($attributes as $attribute){
            if(strtolower($attribute[1])!==$wanted){
                continue;
            }
            // Trailing unmatched groups are dropped, so the value (if any) is the last entry
            return count($attribute)>2 ? $attribute[count($attribute)-1] : "";
        }

        return "";
    }

    /**
     * Collects the attribute part of every occurrence of a tag in the document
     *
     * @param string $tag_pattern the tag name as regular expression fragment (e.g. "img" or "i?frame")
     * @return array the attribute strings in document order, empty if the tag does not occur
     */
    private function find_tag_attributes($tag_pattern){
        $pattern='~<(?:'.$tag_pattern.')\s+([^>]*)>~i';
        if(!preg_match_all($pattern,(string)$this->string,$found)){
            return array();
        }
        return $found[1];
    }

    /**
     * Cuts the inner markup of the <body> element out of the document
     *
     * @return string|null the markup inside the body, null if there is no <body> tag
     */
    private function extract_body(){
        $document=(string)$this->string;
        if(!preg_match('~<body(?:\s[^>]*)?>~i',$document,$open_tag,PREG_OFFSET_CAPTURE)){
            return null;
        }

        $start=$open_tag[0][1]+strlen($open_tag[0][0]);
        $end=stripos($document,'</body',$start);
        if($end===false){
            // Unclosed body: everything up to the end of the document belongs to it
            return (string)substr($document,$start);
        }
        return (string)substr($document,$start,$end-$start);
    }
}
 
?>