怎样从2000万开房数据中查询

怎样从2000万开房数据中查询 · Oct 29, 2013 clicks

前段时间2000万开房数据在网上闹得沸沸扬扬的,相信很多朋友都找到那些数据并下载下来了。
但是一下下来就傻眼了,大的是7、8G的SQL文件,稍小的也是别人导成CSV格式的也有3、4G之大。普通的文本工具一打开就崩溃,即使不崩溃,电脑内存瞬间吃光也会卡死。
借由这个契机,写了几个PHP小程序去查询,顺便回顾了一下PHP中的多进程和多线程编程。

前段时间2000万开房数据在网上闹得沸沸扬扬的,相信很多朋友都找到那些数据并下载下来了。
但是一下下来就傻眼了,大的是7、8G的SQL文件,稍小的也是别人导成CSV格式的也有3、4G之大。普通的文本工具一打开就崩溃,即使不崩溃,电脑内存瞬间吃光也会卡死。

借由这个契机,写了几个PHP小程序去查询,顺便回顾了一下PHP中的多进程和多线程编程。

常规编程,单线程,单进程,顺序执行。

<?php
class Searcher{
	protected $fileList = [];
		
	public function __construct()
	{
		$this->currentTasks[0] = 'main';
		$dir = dir( dirname(__FILE__).'/2000W' );
		while ( ($file = $dir->read() )!=FALSE ){
			if ( strpos($file,'.csv') >0 ){
				$this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
			}
		}
		$dir->close();			
	}	
	
	
	
	public function run($word)
	{
		$start_time = time();
		foreach($this->fileList as $file){
			findInFile($file,$word);
		}
		echo "Finished in ", time()-$start_time , "seconds.\n";
	}
	
}
function findInFile($file,$word){
	echo "start searching $word in $file \n";
	$fp = fopen($file,'rb');
	$i = 0;
	while (!feof($fp)){
		$i ++;
		$line = fgets($fp);
		if ( strstr($line,$word) ){					
			echo "$file [$l]: $line \n ";
		}
	}
	fclose($fp);
	echo "Finish searching in $file \n ";
}


$obj = new Searcher();
$obj->run('张三');

现在尝试一下用pnctl扩展实现多进程:

<?php
class Searcher{
	protected $fileList = [];
		
	public function __construct()
	{
		$this->currentTasks[0] = 'main';
		$dir = dir( dirname(__FILE__).'/2000W' );
		while ( ($file = $dir->read() )!=FALSE ){
			if ( strpos($file,'.csv') >0 ){
				$this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
			}
		}
		$dir->close();			
	}	
	
	public function runMultiProcess($word)
	{
		echo "Begin...\n";
		$start_time = time();
		foreach($this->fileList as $file){
			$lanched = $this->runProcess($file,$word);
		}
		//wait for all child processes to finish
		while ( pcntl_waitpid(0,$status) != -1){
			$status = pcntl_wexitstatus($status);
			echo "Child($status)  Exit.\n";
		}		
		$cost = time() -$start_time;
		echo "All tasks finished in $cost secends.\n";
		
	}
	
	protected function runProcess($file,$word)
	{
		$pid = pcntl_fork();
		if ($pid == -1 ){ //error occurs			
			echo "error occurs while fork new  task for [$file]\n";
			return false;
		}elseif($pid){ // parent process
			//echo "in parent process.\n";
			
		}else{//forked child
			$mypid = getmypid();
			echo "start child process $mypid to read file $file \n";
			$exitStatus = $mypid;
			findInFile($file,$word);				
			exit($exitStatus);
		}
		return true;
	}

	
}
function findInFile($file,$word){
	echo "start searching $word in $file \n";
	$fp = fopen($file,'rb');
	$i = 0;
	while (!feof($fp)){
		$i ++;
		$line = fgets($fp);
		if ( strstr($line,$word) ){					
			echo "$file [$l]: $line \n ";
		}
	}
	fclose($fp);
	echo "Finish searching in $file \n ";
}

$obj = new Searcher();
$obj->runMultiProcess('张三');

进程是个比较烦琐的东西,什么信号啊之类的是最难掌握的,更重要的是各进程之间相互通讯比较困难。

现在再试试基于pthreads扩展的多线程

<?php
class Searcher{
	protected $fileList = [];
		
	public function __construct()
	{
		$this->currentTasks[0] = 'main';
		$dir = dir( dirname(__FILE__).'/2000W' );
		while ( ($file = $dir->read() )!=FALSE ){
			if ( strpos($file,'.csv') >0 ){
				$this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
			}
		}
		$dir->close();			
	}	
	
	public function runMultiThread($word)
	{
		$start_time = time();
		$ths = [];
		foreach($this->fileList as $i=>$file){
			$ths[$i] = new ThSearch($file,$word);
			$ths[$i]->start();
		}
		foreach($ths as $th){
			$th->join();
		}
		echo "Finished in ", time()-$start_time , "seconds.\n";
	}
		
}
function findInFile($file,$word){
	echo "start searching $word in $file \n";
	$fp = fopen($file,'rb');
	$i = 0;
	while (!feof($fp)){
		$i ++;
		$line = fgets($fp);
		if ( strstr($line,$word) ){					
			echo "$file [$l]: $line \n ";
		}
	}
	fclose($fp);
	echo "Finish searching in $file \n ";
}

class ThSearch extends Thread{
	protected $file;
	protected $word;
	public function __construct($file,$word)
	{
		$this->file = $file;
		$this->word = $word;		
	}
	public function run()
	{
		findInFile($this->file,$this->word);
	}
}

$obj = new Searcher();
$obj->runMultiThread('张三');

 

线程相比进程要轻得多,而且也更好控制。

也许你会发现这三种方法性能为什么都差不多? 因为在这个例子中,性能的瓶颈主要在磁盘IO,硬盘读写上限就那么多,自然性能也相差无几了。

PHP