怎样从2000万开房数据中查询

怎样从2000万开房数据中查询 · Oct 29, 2013 689 clicks

前段时间2000万开房数据在网上闹得沸沸扬扬的,相信很多朋友都找到那些数据并下载下来了。
但是一下下来就傻眼了,大的是7、8G的SQL文件,稍小的也是别人导成CSV格式的也有3、4G之大。普通的文本工具一打开就崩溃,即使不崩溃,电脑内存瞬间吃光也会卡死。
借由这个契机,写了几个PHP小程序去查询,顺便回顾了一下PHP中的多进程和多线程编程。

前段时间2000万开房数据在网上闹得沸沸扬扬的,相信很多朋友都找到那些数据并下载下来了。
但是一下下来就傻眼了,大的是7、8G的SQL文件,稍小的也是别人导成CSV格式的也有3、4G之大。普通的文本工具一打开就崩溃,即使不崩溃,电脑内存瞬间吃光也会卡死。

借由这个契机,写了几个PHP小程序去查询,顺便回顾了一下PHP中的多进程和多线程编程。

常规编程,单线程,单进程,顺序执行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
<?php
class Searcher{
    protected $fileList = [];
         
    public function __construct()
    {
        $this->currentTasks[0] = 'main';
        $dir = dir( dirname(__FILE__).'/2000W' );
        while ( ($file = $dir->read() )!=FALSE ){
            if ( strpos($file,'.csv') >0 ){
                $this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
            }
        }
        $dir->close();          
    }  
     
     
     
    public function run($word)
    {
        $start_time = time();
        foreach($this->fileList as $file){
            findInFile($file,$word);
        }
        echo "Finished in ", time()-$start_time , "seconds.\n";
    }
     
}
function findInFile($file,$word){
    echo "start searching $word in $file \n";
    $fp = fopen($file,'rb');
    $i = 0;
    while (!feof($fp)){
        $i ++;
        $line = fgets($fp);
        if ( strstr($line,$word) ){                
            echo "$file [$l]: $line \n ";
        }
    }
    fclose($fp);
    echo "Finish searching in $file \n ";
}
 
 
$obj = new Searcher();
$obj->run('张三');

现在尝试一下用pnctl扩展实现多进程:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
<?php
class Searcher{
    protected $fileList = [];
         
    public function __construct()
    {
        $this->currentTasks[0] = 'main';
        $dir = dir( dirname(__FILE__).'/2000W' );
        while ( ($file = $dir->read() )!=FALSE ){
            if ( strpos($file,'.csv') >0 ){
                $this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
            }
        }
        $dir->close();          
    }  
     
    public function runMultiProcess($word)
    {
        echo "Begin...\n";
        $start_time = time();
        foreach($this->fileList as $file){
            $lanched = $this->runProcess($file,$word);
        }
        //wait for all child processes to finish
        while ( pcntl_waitpid(0,$status) != -1){
            $status = pcntl_wexitstatus($status);
            echo "Child($status)  Exit.\n";
        }      
        $cost = time() -$start_time;
        echo "All tasks finished in $cost secends.\n";
         
    }
     
    protected function runProcess($file,$word)
    {
        $pid = pcntl_fork();
        if ($pid == -1 ){ //error occurs           
            echo "error occurs while fork new  task for [$file]\n";
            return false;
        }elseif($pid){ // parent process
            //echo "in parent process.\n";
             
        }else{//forked child
            $mypid = getmypid();
            echo "start child process $mypid to read file $file \n";
            $exitStatus = $mypid;
            findInFile($file,$word);               
            exit($exitStatus);
        }
        return true;
    }
 
     
}
function findInFile($file,$word){
    echo "start searching $word in $file \n";
    $fp = fopen($file,'rb');
    $i = 0;
    while (!feof($fp)){
        $i ++;
        $line = fgets($fp);
        if ( strstr($line,$word) ){                
            echo "$file [$l]: $line \n ";
        }
    }
    fclose($fp);
    echo "Finish searching in $file \n ";
}
 
$obj = new Searcher();
$obj->runMultiProcess('张三');

进程是个比较烦琐的东西,什么信号啊之类的是最难掌握的,更重要的是各进程之间相互通讯比较困难。

现在再试试基于pthreads扩展的多线程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
<?php
class Searcher{
    protected $fileList = [];
         
    public function __construct()
    {
        $this->currentTasks[0] = 'main';
        $dir = dir( dirname(__FILE__).'/2000W' );
        while ( ($file = $dir->read() )!=FALSE ){
            if ( strpos($file,'.csv') >0 ){
                $this->fileList[] = dirname(__FILE__).'/2000W/' . $file;
            }
        }
        $dir->close();          
    }  
     
    public function runMultiThread($word)
    {
        $start_time = time();
        $ths = [];
        foreach($this->fileList as $i=>$file){
            $ths[$i] = new ThSearch($file,$word);
            $ths[$i]->start();
        }
        foreach($ths as $th){
            $th->join();
        }
        echo "Finished in ", time()-$start_time , "seconds.\n";
    }
         
}
function findInFile($file,$word){
    echo "start searching $word in $file \n";
    $fp = fopen($file,'rb');
    $i = 0;
    while (!feof($fp)){
        $i ++;
        $line = fgets($fp);
        if ( strstr($line,$word) ){                
            echo "$file [$l]: $line \n ";
        }
    }
    fclose($fp);
    echo "Finish searching in $file \n ";
}
 
class ThSearch extends Thread{
    protected $file;
    protected $word;
    public function __construct($file,$word)
    {
        $this->file = $file;
        $this->word = $word;    
    }
    public function run()
    {
        findInFile($this->file,$this->word);
    }
}
 
$obj = new Searcher();
$obj->runMultiThread('张三');

 

线程相比进程要轻得多,而且也更好控制。

也许你会发现这三种方法性能为什么都差不多? 因为在这个例子中,性能的瓶颈主要在磁盘IO,硬盘读写上限就那么多,自然性能也相差无几了。

PHP