This example takes a single website and shows how to scrape its page content programmatically using the Simple HTML DOM script.

NOTE: Download the Simple HTML DOM script (simple_html_dom.php) from here and include it:

<?php
include_once('../simple_html_dom.php');

// Fetch and parse the remote page that is going to be scraped.
$html = file_get_html('https://manuals.plus/ajax/000165-wireless-panic-button-and-remote-control-manual');
if (!$html) {
    // Stop early with a clear message instead of failing further down with a
    // method call on a non-object when the page could not be loaded.
    exit('Could not load the page to scrape.');
}

// Base URL of the local site that will host the mirrored assets. Paths are
// appended to it directly, so it has to end with a slash.
// Alternative base URL used during development:
//$mysiteUrl = 'http://localhost/scrap/simplehtmldom/example/';
$mysiteUrl = 'http://localhost/manualsplus/';

// Prefix put in front of asset URLs before they are downloaded; empty by default.
$prefix = '';

    // Mirror every remote image into the local uploads folder and rewrite the
    // <img> tag so that it points at the local copy.
    $remoteBase = 'https://manuals.plus/';
    $uploadDir  = '../../wp-content/uploads/2022/05/';
    $uploadUrl  = $mysiteUrl . 'wp-content/uploads/2022/05/';

    foreach ($html->find('img') as $content) {
        // Strip everything except the attributes needed to render the image.
        foreach ($content->getAllAttributes() as $attr => $val) {
            if (!in_array($attr, ['src', 'width', 'height', 'alt'], true)) {
                $content->removeAttribute($attr);
            }
        }

        $ImgSrc = $content->src;

        // Nothing to download: missing/empty src or an inline data: URI.
        if (!is_string($ImgSrc) || $ImgSrc === '' || str_starts_with($ImgSrc, 'data:')) {
            continue;
        }
        // Sources containing "gif" are deliberately left untouched.
        if (str_contains($ImgSrc, 'gif')) {
            continue;
        }

        // Resolve the download URL per image. The prefix must not leak from one
        // iteration into the next, otherwise an absolute URL that follows a
        // relative one would be fetched as "https://manuals.plus/https://...".
        if (preg_match('#^https?://#i', $ImgSrc) === 1) {
            $sourceUrl = $ImgSrc;
        } elseif (str_starts_with($ImgSrc, '//')) {
            // Protocol-relative URL: only the scheme is missing.
            $sourceUrl = 'https:' . $ImgSrc;
        } else {
            $sourceUrl = $remoteBase . ltrim($ImgSrc, '/');
        }

        // Build the file name from the URL path only, so that a query string
        // ("photo.jpg?w=300") does not end up in the name on disk.
        $imagName = basename((string) parse_url($sourceUrl, PHP_URL_PATH));
        if ($imagName === '') {
            continue;
        }

        $imageData = file_get_contents($sourceUrl);
        if ($imageData === false) {
            // Download failed: keep the original src rather than pointing the
            // tag at a file that was never saved.
            continue;
        }
        if (file_put_contents($uploadDir . $imagName, $imageData) === false) {
            // Could not write the local copy: leave the tag as it is.
            continue;
        }

        // Point the tag at the local copy; the rewrite is the same for absolute
        // and relative sources. sizes/srcset are set to empty strings so the
        // emitted markup carries no remote responsive-image candidates.
        $content->src    = $uploadUrl . $imagName;
        $content->sizes  = '';
        $content->srcset = '';
        $content->alt    = $imagName;
    }

    // Mirror linked PDFs into the local uploads folder and re-point links that
    // target the remote site at the local site.
    $remoteBase = 'https://manuals.plus/';
    $uploadDir  = '../../wp-content/uploads/2022/05/';
    // Normalise to exactly one trailing slash so joined URLs never contain "//".
    $siteBase   = rtrim($mysiteUrl, '/') . '/';

    foreach ($html->find('a') as $content) {
        // Keep only the href; every other attribute is dropped.
        foreach ($content->getAllAttributes() as $attr => $val) {
            if ($attr !== 'href') {
                $content->removeAttribute($attr);
            }
        }

        $href = $content->href;
        if (!is_string($href) || $href === '') {
            // Anchor without a usable target: nothing to rewrite.
            continue;
        }

        if (str_contains($href, '.pdf')) {
            // Resolve the download URL per link. The prefix is computed fresh
            // for each anchor so a relative link cannot corrupt the URL of an
            // absolute link that follows it.
            if (preg_match('#^https?://#i', $href) === 1) {
                $sourceUrl = $href;
            } elseif (str_starts_with($href, '//')) {
                // Protocol-relative URL: only the scheme is missing.
                $sourceUrl = 'https:' . $href;
            } else {
                $sourceUrl = $remoteBase . ltrim($href, '/');
            }

            // File name comes from the URL path only, without any query string.
            $fileName = basename((string) parse_url($sourceUrl, PHP_URL_PATH));
            if ($fileName === '') {
                continue;
            }

            $pdfData = file_get_contents($sourceUrl);
            if ($pdfData === false) {
                // Download failed: keep the original link instead of pointing
                // at a file that was never saved.
                continue;
            }
            if (file_put_contents($uploadDir . $fileName, $pdfData) === false) {
                continue;
            }

            // Update the anchor to the local copy.
            $content->href = $siteBase . 'wp-content/uploads/2022/05/' . $fileName;
        } elseif (str_starts_with($href, $remoteBase)) {
            // Swap the remote origin for the local one. Matching only at the
            // start of the URL avoids rewriting links that merely carry the
            // remote address inside a query string.
            $content->href = $siteBase . substr($href, strlen($remoteBase));
        }
    }

   //Remove Related Manuals

    // Blank out the first element linking to the related-manuals section, if any.
    $relatedLink = $html->find('[href="#related_manuals_resources"]', 0);
    if ($relatedLink) {
        $relatedLink->outertext = '';
    }

    // Blank out the first related-posts container, if any.
    $relatedPosts = $html->find('.rp4wp-related-posts', 0);
    if ($relatedPosts) {
        $relatedPosts->outertext = '';
    }

    // Clear the href attribute on the first table cell, if the page has one.
    // NOTE(review): the original comment says "remove Documents image link",
    // but the selector targets the <td> itself, not a link inside it - confirm
    // whether 'table td a' was intended.
    $firstCell = $html->find('table td', 0);
    if ($firstCell) {
        $firstCell->href = "";
    }

    

    

// Serialise the modified document.
// NOTE(review): save() is called without a target path and its return value is
// not used - presumably this has no lasting effect here; confirm against the
// library before removing it.
$html->save();

// Pull out the three pieces of the page that are of interest. find(..., 0)
// yields null when the element does not exist, so the nullsafe operator is used
// to fall back to an empty string instead of emitting
// "attempt to read property on null" warnings into the output.
$title     = $html->find('.breadcrumb_last', 0)?->plaintext ?? '';
$content   = $html->find('.entry-content', 0)?->outertext ?? '';
$imagePath = $html->find('.post-thumbnail img', 0)?->src ?? '';

// Print title, body markup and thumbnail URL, each separated by blank lines.
$separator = '<br/><br/><br/><br/>';
echo $separator;
echo $title;
echo $separator;
echo $content;
echo $separator;
echo $imagePath;
echo $separator;

Leave a comment