<?php
// Error configuration: display errors, but report fatal errors only.
ini_set('display_errors', true);
error_reporting(E_ERROR);
// Remove the execution time limit for this request.
set_time_limit(0);
// Start the session up front; the fetch helpers below store their results in $_SESSION.
session_start();
// URL of the book's table-of-contents page.
$page = 'http://www.abcsee.net/book/72/72932/';
// Output file name, converted to GBK so the Chinese file name displays correctly on Windows.
$output = iconv("UTF-8", "GBK", './我的姥姥是半仙(全文版).txt');
// Current chapter index from the query string; a missing or negative value starts at 0.
$p = isset($_GET['p']) ? max(0, (int)$_GET['p']) : 0;
// Load the chapter list.
$list = get_lists($page, true);
// get_lists() returns an empty string/array when nothing could be parsed: stop
// here instead of truncating the output file and writing empty chapters.
if (!is_array($list) || empty($list)) {
    echo 'chapter list unavailable!';
    exit;
}
// Past the last chapter: finish instead of refreshing forever.
if (!isset($list[$p])) {
    echo '全部数据采集完成!';
    exit;
}
// On the first chapter, (re)create the file with the book title as its heading.
if ($p == 0) {
    file_put_contents($output, '我的姥姥是半仙(全文版)' . PHP_EOL . PHP_EOL);
}
// URL (relative to $page) and title of the current chapter.
$url = $list[$p]['url'];
$title = $list[$p]['title'];
// Append the chapter to the text file, then schedule the next chapter.
if (false !== execute_content($page . $url, $title, $output)) {
    $next = $p + 1;
    $time = 1;
    // Headers must be sent before any output, so the refresh header goes out
    // before the progress message is echoed.
    header("refresh:{$time};url=tools.php?p={$next}");
    echo "第{$p}页<br><br>{$title}<br><br>数据采集完成!";
    exit;
}
echo 'file append error!';
/**
 * Fetch one chapter page and append it, as plain text, to the output file.
 *
 * The chapter body is taken from the page's <dd id="contents"> element,
 * converted with html2text(), and stripped of every line break and tab so
 * the whole chapter ends up on a single line below its title.
 *
 * @param string      $url    absolute URL of the chapter page
 * @param string|null $title  chapter heading written above the text; although it
 *                            declares a default, it has to be passed because
 *                            $output follows it
 * @param string      $output path of the file to append to
 * @return int|false number of bytes written, or false when the write failed
 */
function execute_content($url, $title = null, $output)
{
    // Download the page (served from the session cache when available).
    $html = http_get_content($url, true);

    // Pull out the chapter body; an empty string when the element is missing.
    $body = "";
    if (preg_match('/<dd id=\"contents\">(.*)<\/dd>/isU', $html, $found)) {
        $body = $found[1];
    }

    // HTML -> text, then drop the leftover '0 40px">' attribute fragment and
    // every newline / tab character.
    $plain = trim(html2text($body));
    $plain = str_replace('0 40px">', '', $plain);
    $plain = preg_replace("/(\r\n|\n|\r|\t)/i", '', $plain);

    // Layout of one chapter: blank line, title, text.
    $chunk = PHP_EOL . $title . PHP_EOL . $plain . PHP_EOL;

    // The write result doubles as the status reported to the caller.
    return file_put_contents($output, $chunk, FILE_APPEND);
}
/**
 * Convert an HTML fragment to plain text.
 *
 * Steps, in order:
 *  1. <style> and <script> elements are removed together with their contents.
 *  2. <br> becomes a newline, <p>/</p> a blank line, and bare
 *     <td>, <div>, <blockquote>, <li> tags (opening or closing) a newline.
 *  3. All remaining tags are stripped.
 *  4. Entities are decoded in a single pass: &nbsp; becomes a regular space,
 *     curly-quote entities become ASCII quotes, everything else is decoded to
 *     UTF-8, and numeric references that cannot be decoded are dropped.
 *
 * Tags are stripped BEFORE entities are decoded, so an encoded "&lt;" in the
 * text can never be mistaken for the start of a tag, and each entity is
 * decoded exactly once ("&amp;lt;" yields the literal text "&lt;").
 *
 * @param string $str HTML fragment
 * @return string plain-text version of the fragment
 */
function html2text($str)
{
    // Drop style/script blocks, with or without attributes on the opening tag.
    $str = preg_replace('/<style\b[^>]*>.*?<\/style>/is', '', $str);
    $str = preg_replace('/<script\b[^>]*>.*?<\/script>/is', '', $str);

    // Line-breaking tags -> newlines. Every spelling of <br> is accepted
    // (<br>, <br/>, <br />, <br clear="all">).
    $str = preg_replace('/<br\b[^>]*>/i', "\n", $str);
    $str = preg_replace('/<\/?p>/i', "\n\n", $str);
    $str = preg_replace('/<\/?(?:td|div|blockquote|li)>/i', "\n", $str);

    // Remove the remaining markup while "<" and ">" in the text are still encoded.
    $str = strip_tags($str);

    // Entities that get a fixed plain-ASCII replacement. These are also
    // accepted without the trailing semicolon and in any letter case.
    $plain = array(
        'nbsp'  => ' ',
        'amp'   => '&',
        'lt'    => '<',
        'gt'    => '>',
        'ldquo' => '"',
        'rdquo' => '"',
        'lsquo' => "'",
        'rsquo' => "'",
    );

    // One pass over every entity, so the output of one replacement is never
    // decoded a second time.
    return preg_replace_callback(
        '/&(nbsp|amp|lt|gt|ldquo|rdquo|lsquo|rsquo);?|&#[0-9a-z]*;|&[0-9a-z]+;/i',
        function ($m) use ($plain) {
            if (isset($m[1]) && $m[1] !== '') {
                return $plain[strtolower($m[1])];
            }
            $decoded = html_entity_decode($m[0], ENT_QUOTES, 'UTF-8');
            // A numeric reference that could not be decoded is removed;
            // an unknown named entity is left as written.
            if ($decoded === $m[0] && $m[0][1] === '#') {
                return '';
            }
            return $decoded;
        },
        $str
    );
}
/**
 * Fetch the table-of-contents page and extract its chapter links.
 *
 * Only the first <table> element of the page is scanned; every
 * <a href="..."> inside it becomes one entry, in document order.
 *
 * @param string $page  URL of the table-of-contents page
 * @param bool   $cache when true, a list previously stored in $_SESSION under
 *                      sha1($page) is returned directly, and a freshly parsed
 *                      non-empty list is stored there
 * @return array|string list of array('url' => href, 'title' => link text);
 *                      an empty array when the table holds no links, or an
 *                      empty string when the page has no <table>
 */
function get_lists($page, $cache = false)
{
    // Session key under which the parsed list is cached.
    $key = sha1($page);

    // Serve the parsed list from the session when it is already there.
    if ($cache && !empty($_SESSION[$key]) && is_array($_SESSION[$key])) {
        return $_SESSION[$key];
    }

    $list = array();
    // Download the page (http_get_content keeps its own session cache).
    $html = http_get_content($page, true);
    // Isolate the first table, which holds the chapter list.
    $table = preg_match("/<table(.*)<\/table>/isU", $html, $temp) ? $temp[1] : "";
    if (empty($table)) {
        return $table;
    }
    // Collect href / link text pairs.
    $links = preg_match_all('/<a href=\"(.*?)\".*?>(.*?)<\/a>/i', $table, $matches);
    if ($links && isset($matches[1])) {
        // The loop variables must not reuse $key: doing so would overwrite the
        // session key with the last link index and break the cache write below.
        foreach ($matches[1] as $index => $href) {
            $list[$index] = array(
                'url' => $href,
                'title' => $matches[2][$index]
            );
        }
        // Remember the parsed list for later requests in this session.
        if (!empty($list) && $cache) {
            $_SESSION[$key] = $list;
        }
    }
    return $list;
}
/**
 * Download a page with cURL and return its HTML converted to UTF-8.
 *
 * The source encoding is taken from the page's <meta ... charset=...> tag.
 * When no charset is declared, or the page already declares UTF-8, the body
 * is returned untouched. When conversion fails even after skipping illegal
 * byte sequences, the raw body is kept rather than discarded.
 *
 * A successful result is always stored in $_SESSION under md5($url).
 *
 * @param string $url   URL to fetch
 * @param bool   $cache when true, a non-empty body already stored in the
 *                      session for this URL is returned without any request
 * @return string page HTML, or an empty string when the download failed
 */
function http_get_content($url, $cache = false)
{
    // Session cache key for this URL.
    $key = md5($url);
    // With caching enabled the page is downloaded at most once per session.
    if ($cache && !empty($_SESSION[$key])) {
        return $_SESSION[$key];
    }

    // Request the page through cURL.
    $ch = curl_init();
    // Connection timeout in seconds.
    $timeout = 30;
    curl_setopt($ch, CURLOPT_URL, $url);
    // Spoof forwarding headers, referer and user agent so the request is not
    // intercepted by DNS-acceleration tools.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:111.222.333.4', 'CLIENT-IP:111.222.333.4'));
    curl_setopt($ch, CURLOPT_REFERER, "http://www.baidu.com");
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    $file_contents = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure: report "no content" and do not
    // cache it, so a later request can try again.
    if (!is_string($file_contents) || $file_contents === '') {
        return '';
    }

    // Detect the charset declared by the page.
    $charset = preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i", $file_contents, $temp) ? strtolower($temp[1]) : "";
    // Convert only when a non-UTF-8 charset was actually declared.
    if ($charset !== '' && $charset !== 'utf-8' && $charset !== 'utf8') {
        $source = strtoupper($charset);
        $converted = iconv($source, "UTF-8", $file_contents);
        if ($converted === false) {
            // A single illegal byte sequence makes iconv() fail; retry while
            // skipping such sequences instead of losing the whole page.
            $converted = iconv($source, "UTF-8//IGNORE", $file_contents);
        }
        if ($converted !== false) {
            $file_contents = $converted;
        }
    }

    // Store the result in the session so the next call can reuse it.
    $_SESSION[$key] = $file_contents;
    return $file_contents;
}