This example takes one website and shows how to scrape its page content programmatically using the Simple HTML DOM script.
NOTE: Download the Simple HTML DOM script (simple_html_dom.php) from here and include it:
<?php
// The parser library is mandatory: without it file_get_html() does not exist,
// so fail immediately instead of continuing after a mere include warning.
require_once('../simple_html_dom.php');

// Page to scrape.
$pageUrl = 'https://manuals.plus/ajax/000165-wireless-panic-button-and-remote-control-manual';

// Base URL of the local site that scraped links and assets are rewritten to.
// It must end with a slash because paths are appended to it directly.
// Alternative target used during development:
//   http://localhost/scrap/simplehtmldom/example/
$mysiteUrl = 'http://localhost/manualsplus/';

// Kept for compatibility with the download loops, which prepend it to
// relative asset paths.
$prefix = '';

// Start scraping: file_get_html() yields a falsy value when the page cannot be
// loaded, so stop here rather than crash on the first $html->find() call.
$html = file_get_html($pageUrl);
if (!$html) {
    exit('Unable to load or parse ' . $pageUrl);
}
// Mirror every <img> of the scraped page into the local uploads folder and
// point the tag at the local copy.
$remoteBase = 'https://manuals.plus/';
$uploadDir = '../../wp-content/uploads/2022/05/';
$uploadUrl = $mysiteUrl . 'wp-content/uploads/2022/05/';
$keptImgAttributes = ['src', 'width', 'height', 'alt'];

foreach ($html->find('img') as $image) {
    // Strip everything except the attributes that are re-published.
    foreach ($image->getAllAttributes() as $attribute => $unused) {
        if (!in_array($attribute, $keptImgAttributes, true)) {
            $image->removeAttribute($attribute);
        }
    }

    $src = $image->src;

    // Nothing to download when the tag has no usable src.
    if (!is_string($src) || $src === '') {
        continue;
    }

    // Ignore GIFs. This is a substring match, so any URL that merely contains
    // "gif" is skipped as well.
    if (str_contains($src, 'gif')) {
        continue;
    }

    // Inline data URIs are already self-contained; there is nothing to fetch.
    if (str_starts_with($src, 'data:')) {
        continue;
    }

    // Resolve the download URL per image. The previous shared $prefix was
    // never reset, so one relative image corrupted every later absolute URL.
    // Relative paths are resolved against the site root, not the page URL.
    if (preg_match('#^https?://#i', $src) === 1) {
        $remoteUrl = $src;
    } elseif (str_starts_with($src, '//')) {
        $remoteUrl = 'https:' . $src;
    } else {
        $remoteUrl = $remoteBase . ltrim($src, '/');
    }

    // Take the file name from the URL path only, so a query string such as
    // "?v=2" does not end up in the file name on disk.
    $fileName = basename((string) parse_url($remoteUrl, PHP_URL_PATH));
    if ($fileName === '') {
        continue;
    }

    // Download and store the image. On failure leave the tag untouched rather
    // than writing an empty file and linking to it.
    $bytes = file_get_contents($remoteUrl);
    if ($bytes === false || file_put_contents($uploadDir . $fileName, $bytes) === false) {
        error_log('Could not mirror image ' . $remoteUrl);
        continue;
    }

    // Update the image path. sizes/srcset are set to empty strings so no
    // responsive source pointing at the remote site remains on the tag.
    $image->src = $uploadUrl . $fileName;
    $image->sizes = '';
    $image->srcset = '';
    $image->alt = $fileName;
}
// Rewrite every <a>: PDF links are downloaded into the local uploads folder
// and re-pointed at the local copy; other links to the source site are
// re-pointed at the local site.
$remoteBase = 'https://manuals.plus/';
$uploadDir = '../../wp-content/uploads/2022/05/';
$uploadUrl = $mysiteUrl . 'wp-content/uploads/2022/05/';

foreach ($html->find('a') as $anchor) {
    // Keep only the href attribute.
    foreach ($anchor->getAllAttributes() as $attribute => $unused) {
        if ($attribute !== 'href') {
            $anchor->removeAttribute($attribute);
        }
    }

    $href = $anchor->href;

    // Anchors without a usable href have nothing to rewrite.
    if (!is_string($href) || $href === '') {
        continue;
    }

    if (str_contains($href, '.pdf')) {
        // Resolve the download URL per link instead of reusing a shared
        // prefix that may have been left set by an earlier iteration.
        // Relative paths are resolved against the site root.
        if (preg_match('#^https?://#i', $href) === 1) {
            $remoteUrl = $href;
        } elseif (str_starts_with($href, '//')) {
            $remoteUrl = 'https:' . $href;
        } else {
            $remoteUrl = $remoteBase . ltrim($href, '/');
        }

        // File name from the URL path only, without any query string.
        $fileName = basename((string) parse_url($remoteUrl, PHP_URL_PATH));
        if ($fileName === '') {
            continue;
        }

        // Download and store the document. On failure keep the original link
        // instead of pointing at an empty local file.
        $bytes = file_get_contents($remoteUrl);
        if ($bytes === false || file_put_contents($uploadDir . $fileName, $bytes) === false) {
            error_log('Could not mirror document ' . $remoteUrl);
            continue;
        }

        // Update the anchor path.
        $anchor->href = $uploadUrl . $fileName;
    } elseif (str_contains($href, $remoteBase)) {
        // $mysiteUrl already ends with a slash, so the trailing slash of the
        // remote base must be replaced too; otherwise the result contains "//".
        $anchor->href = str_replace($remoteBase, $mysiteUrl, $href);
    }
}
// Remove Related Manuals: blank out the first element matching each selector
// (the jump link to the related-manuals section and the related-posts block).
foreach (['[href="#related_manuals_resources"]', '.rp4wp-related-posts'] as $selector) {
    $node = $html->find($selector, 0);
    if ($node) {
        $node->outertext = '';
    }
}

// Remove Documents image link: sets an empty href attribute on the first
// table cell, when the page has one.
// NOTE(review): this writes href onto the <td> itself, not onto a link inside
// it - confirm that this is the intended target.
$firstCell = $html->find('table td', 0);
if ($firstCell) {
    $firstCell->href = "";
}
// Save the variable. No file path is passed and the return value is discarded.
// NOTE(review): presumably redundant - confirm against the library before
// removing.
$html->save();

// Extract the pieces of the page to publish. find(..., 0) returns null when
// the selector matches nothing, so use null-safe access and fall back to an
// empty string instead of emitting "property on null" warnings into the page.
$title = $html->find('.breadcrumb_last', 0)?->plaintext ?? '';
$content = $html->find('.entry-content', 0)?->outertext ?? '';
$imagePath = $html->find('.post-thumbnail img', 0)?->src ?? '';

// Print title, content and thumbnail path, each surrounded by spacer breaks.
$separator = '<br/><br/><br/><br/>';
echo $separator;
echo $title;
echo $separator;
echo $content;
echo $separator;
echo $imagePath;
echo $separator;