514482 ランダム
 HOME | DIARY | PROFILE 【フォローする】 【ログイン】

けむしろうの部屋別館

楽天テーマ検索用プログラム

高速化版楽天テーマ検索プログラム

目次ファイルと位置情報ファイルの作成

CRC32への変換
#!/usr/bin/perl
#
# Tokenise columns 3 and 4 of every ./data/G*.csv file with ChaSen and print,
# for each CSV line, the byte offset of that line followed by the CRC32 of
# each token found on it.
#
# Output format (one item per line):
#   <file name>
#   \t<byte offset of a CSV line>
#   \t\t<crc32 of one token from the line that starts at the offset above>

use strict;
use warnings;

use Compress::Zlib;
use Text::ParseWords;
use Jcode;

# Character ranges handed to Jcode's tr() to normalise the text before it is
# tokenised.
# NOTE(review): the source ranges presumably contained full-width characters
# originally; they are reproduced here exactly as found -- confirm.
my $from = 'A-Za-z0-9a-z';
my $to = 'A-ZA-Z0-9A-Z';

opendir(my $dir, "./data") or die "Cannot open ./data: $!";
my @files = sort readdir($dir);
closedir($dir);

foreach my $fname (@files) {
    # The former pattern /G*.csv/ matched any name containing a character
    # followed by "csv"; anchor it so only G*.csv files are processed.
    next unless $fname =~ /\AG.*\.csv\z/;

    my $pname = "./data/" . $fname;
    print $fname, "\n";

    # Three-argument open: the file name can never be read as an open mode.
    open(my $fh, '<', $pname) or die "Cannot open $pname: $!";
    print "\t", tell($fh), "\n";

    while (my $record = <$fh>) {
        my @line = quotewords(",", 0, $record);

        # Short or unparsable lines yield undefined fields; treat them as
        # empty text so the offset line below is still printed.
        my @text;
        foreach my $col (3, 4) {
            my $field = defined $line[$col] ? $line[$col] : '';
            push @text, Jcode->new($field)->tr($from, $to);
        }

        # Make the text safe inside a single-quoted shell word, and feed it
        # with printf so the shell never interprets backslashes in the data.
        my $quoted = "$text[0] $text[1]";
        $quoted =~ s/'/'\\''/g;
        my @result = `printf '%s\\n' '$quoted' | chasen -F '%m\n'`;

        foreach my $token (@result) {
            # ChaSen terminates each sentence with an "EOS" line.
            next if $token =~ /^EOS$/;
            # The trailing newline is deliberately part of the hashed string;
            # stripping it would change every key.
            print "\t\t", crc32($token), "\n";
        }
        print "\t", tell($fh), "\n";
    }
    close($fh);
}
インデックスファイルの作成
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FILENAMELEN     7
#define BUFFERLEN       1024

#define TOCFILE         "toc.dat"
#define TABLEFILE       "table.dat"

/* One index record: a numeric key and the place where it was seen. */
typedef struct entry {
    /* Key value parsed from a "\t\t"-prefixed input line in main(). */
    unsigned long       key;
    /* File name, truncated to FILENAMELEN characters and NUL-terminated. */
    char                filename[FILENAMELEN + 1];
    /* Offset value parsed from the most recent "\t"-prefixed input line. */
    long                location;
} ENTRY;

/* Growable array of ENTRY records. */
typedef struct index {
    /* Number of slots of `entries` currently in use. */
    int         count;
    /* Number of slots allocated; 0 means nothing has been allocated yet. */
    int         size;
    /* Heap storage for the records; grown by realloc_entry(). */
    ENTRY       *entries;
} INDEX;

/* NOTE(review): not referenced by any function in the visible code, which
 * works on INDEX.entries instead -- confirm before removing. */
ENTRY *entries;

/*
 * Grow the entry array of *index.
 *
 * The first call (size == 0) allocates room for 100 entries; every later
 * call doubles the capacity while preserving the existing entries.
 * Prints "Memory overflow" and exits with status 1 if memory runs out.
 */
void realloc_entry(INDEX *index)
{
    ENTRY *new_entries;
    int new_size;

    if (index->size == 0) {
        /* index->entries may still be uninitialised here, so it must not
         * be handed to realloc(). */
        new_size = 100;
        new_entries = (ENTRY *)malloc(sizeof(ENTRY) * new_size);
    }
    else {
        /* realloc() copies only the old allocation; the previous
         * malloc+memcpy copied the doubled size and read past the end of
         * the old buffer. */
        new_size = index->size * 2;
        new_entries = (ENTRY *)realloc(index->entries, sizeof(ENTRY) * new_size);
    }

    /* Check before touching the new block or publishing it. */
    if (new_entries == NULL) {
        printf("Memory overflow\n");
        exit(1);
    }
    index->entries = new_entries;
    index->size = new_size;
}

/*
 * Append one (key, filename, location) record to *index, growing the
 * array first when it is full. The file name is truncated to FILENAMELEN
 * characters and always NUL-terminated.
 */
void add_entry(INDEX *index, unsigned long key, char *filename, long location)
{
    ENTRY *slot;

    if (index->count == index->size) {
        realloc_entry(index);
    }

    /* Claim the next free slot, then fill it in. */
    slot = &index->entries[index->count++];
    slot->key = key;
    strncpy(slot->filename, filename, FILENAMELEN);
    slot->filename[FILENAMELEN] = '\0';
    slot->location = location;
}

/*
 * qsort() comparator: orders ENTRY records by ascending key.
 * Returns 1, -1 or 0 when the first key is greater than, less than or
 * equal to the second.
 */
int cmpentry(const void *key1, const void *key2)
{
    const unsigned long lhs = ((const ENTRY *)key1)->key;
    const unsigned long rhs = ((const ENTRY *)key2)->key;

    /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
    return (lhs > rhs) - (lhs < rhs);
}

/*
 * Write the (already key-sorted) entries of *index to two binary files.
 *
 * TABLEFILE receives, for every entry, the filename field followed by the
 * location; an all-zero record is written between the groups of two
 * different keys. TOCFILE receives, for every distinct key, the key
 * followed by the offset in TABLEFILE where that key's group starts.
 *
 * Exits with status 1 if a file cannot be opened or written.
 */
void write_index(INDEX *index)
{
    FILE *fp_toc;
    FILE *fp_table;
    unsigned long current_key = 0;
    int i;
    char sentinel_fname[FILENAMELEN + 1];
    long sentinel_location = 0;
    memset(sentinel_fname, '\0', sizeof(sentinel_fname));

    /* Both files hold raw binary records, so open them in binary mode. */
    if ((fp_toc = fopen(TOCFILE, "wb")) == NULL) {
        printf("can't open %s\n", TOCFILE);
        exit(1);
    }

    if ((fp_table = fopen(TABLEFILE, "wb")) == NULL) {
        printf("can't open %s\n", TABLEFILE);
        exit(1);
    }

    for (i = 0; i < index->count; i++) {
        ENTRY *e = &index->entries[i];

        /* A new group starts at the first entry and whenever the key
         * changes. Testing i == 0 explicitly avoids relying on a magic
         * "impossible" initial key: (unsigned long)-1 can be a real key. */
        if (i == 0 || e->key != current_key) {
            long location;
            if (i > 0) {
                /* Terminate the previous group with an all-zero record. */
                fwrite(sentinel_fname, sizeof(sentinel_fname), 1, fp_table);
                fwrite(&sentinel_location, sizeof(long), 1, fp_table);
            }
            current_key = e->key;
            location = ftell(fp_table);
            fwrite(&e->key, sizeof(unsigned long), 1, fp_toc);
            fwrite(&location, sizeof(long), 1, fp_toc);
        }
        fwrite(e->filename, sizeof(e->filename), 1, fp_table);
        fwrite(&e->location, sizeof(long), 1, fp_table);
    }

    /* Buffered write errors may only surface at close time. */
    if (ferror(fp_table) || fclose(fp_table) != 0) {
        printf("can't write %s\n", TABLEFILE);
        exit(1);
    }
    if (ferror(fp_toc) || fclose(fp_toc) != 0) {
        printf("can't write %s\n", TOCFILE);
        exit(1);
    }
}

/*
 * Read the three-level listing from stdin and build the index files.
 *
 * Input lines are classified by their leading tabs:
 *   no tab     - a file name (echoed to stdout as progress output)
 *   one tab    - a location within the current file
 *   two tabs   - a key seen at the current file/location
 * Every key line becomes one entry; the entries are then sorted by key
 * and written out by write_index().
 */
int main(int argc, char* argv[])
{
    char buffer[BUFFERLEN];
    INDEX index;
    char fname[FILENAMELEN + 1];
    long location = 0;
    unsigned long key;
    int have_fname = 0;

    (void)argc;
    (void)argv;

    memset(buffer, '\0', BUFFERLEN);
    memset(fname, '\0', sizeof(fname));
    index.size = 0;
    index.count = 0;
    index.entries = NULL;

    while (fgets(buffer, sizeof(buffer), stdin) != NULL) {
        if (buffer[0] != '\t') { /* This line means filename. */
            size_t len = strlen(buffer);
            if (len > 0 && buffer[len - 1] == '\n') {
                buffer[len - 1] = '\0';
            }
            strncpy(fname, buffer, FILENAMELEN);
            fname[FILENAMELEN] = '\0';
            have_fname = 1;
            printf("%s\n", fname);
        }
        else if (buffer[1] != '\t') {
            location = strtol(&buffer[1], NULL, 0);
        }
        else if (have_fname) {
            /* Key lines seen before any file name are skipped: an entry
             * with an empty file name would be indistinguishable from the
             * all-zero separator record written by write_index(). */
            key = strtoul(&buffer[2], NULL, 0);
            add_entry(&index, key, fname, location);
        }
    }

    /* With no entries there is nothing allocated to sort. */
    if (index.count > 0) {
        qsort(index.entries, index.count, sizeof(ENTRY), cmpentry);
    }
    write_index(&index);
    free(index.entries);
    return 0;
}
単語の検索
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>

#define BUFFERLEN       1024
#define FILENAMELEN     7
#define MAXKEY          100

#define TOCFILE         "toc.dat"
#define TABLEFILE       "table.dat"

/* One table-of-contents record as loaded from TOCFILE by read_toc(). */
typedef struct entry {
    /* Search key; the table is looked up by this field via cmpentry(). */
    unsigned long key;
    /* Offset in TABLEFILE that main() seeks to for this key. */
    long location;
} ENTRY;

/* One hit read from TABLEFILE: a file name and a location inside it. */
typedef struct location {
    /* NUL-terminated file name, at most FILENAMELEN characters. */
    char filename[FILENAMELEN + 1];
    /* Location value stored with the file name; printed by main(). */
    long location;
} LOCATION;

/* Table of contents: allocated and filled by read_toc(), searched with
 * bsearch() in main(). */
ENTRY *toc_entries;

/*
 * Load TOCFILE into the global toc_entries array.
 *
 * The file is a sequence of records, each an unsigned long key followed
 * by a long location. Returns the number of records read. Exits with
 * status 1 if the file cannot be opened, examined or read completely, or
 * if memory cannot be allocated.
 */
int read_toc()
{
    /* Size of one on-disk record, derived from the field types that are
     * actually read below instead of a hard-coded 8. */
    const size_t record_size = sizeof(unsigned long) + sizeof(long);
    struct stat statbuf;
    int toc_size;
    int fd;
    int i;

    if ((fd = open(TOCFILE, O_RDONLY)) == -1) {
        exit(1);
    }
    if (fstat(fd, &statbuf) == -1) {
        exit(1);
    }
    toc_size = (int)((size_t)statbuf.st_size / record_size);

    /* Allocate at least one element so an empty file still yields a
     * valid, non-NULL array. */
    toc_entries = (ENTRY *)malloc(sizeof(ENTRY) * (toc_size > 0 ? toc_size : 1));
    if (toc_entries == NULL) {
        exit(1);
    }

    for (i = 0; i < toc_size; i++) {
        if (read(fd, &toc_entries[i].key, sizeof(unsigned long))
                != (ssize_t)sizeof(unsigned long)) {
            exit(1);
        }
        if (read(fd, &toc_entries[i].location, sizeof(long))
                != (ssize_t)sizeof(long)) {
            exit(1);
        }
    }
    close(fd);
    return toc_size;
}

/*
 * Comparator for bsearch(): orders ENTRY records by ascending key.
 * Returns -1, 1 or 0 when the first key is less than, greater than or
 * equal to the second.
 */
int cmpentry(const void *key1, const void *key2)
{
    const unsigned long lhs = ((const ENTRY *)key1)->key;
    const unsigned long rhs = ((const ENTRY *)key2)->key;

    /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
    return (lhs > rhs) - (lhs < rhs);
}

/*
 * Read one table record from fd into *loc: the filename field followed
 * by the location.
 *
 * Exits with status 2 if the file name cannot be read completely and
 * with status 3 if the location cannot.
 */
void read_location(int fd, LOCATION *loc)
{
    /* Field sizes come from the struct rather than the literals 8 and 4,
     * so `location` is always filled completely whatever sizeof(long) is. */
    if (read(fd, loc->filename, sizeof(loc->filename))
            != (ssize_t)sizeof(loc->filename)) {
        exit(2);
    }
    /* Guard later strcmp()/printf() calls against a damaged file. */
    loc->filename[sizeof(loc->filename) - 1] = '\0';

    if (read(fd, &loc->location, sizeof(loc->location))
            != (ssize_t)sizeof(loc->location)) {
        exit(3);
    }
}

/*
 * Comparator for LOCATION records: file name first (strcmp order), then
 * ascending location. Returns the strcmp() result when the names differ,
 * otherwise 1, -1 or 0 according to the locations.
 */
int cmplocation(const void *key1, const void *key2)
{
    const LOCATION *lhs = (const LOCATION *)key1;
    const LOCATION *rhs = (const LOCATION *)key2;
    int by_name = strcmp(lhs->filename, rhs->filename);

    if (by_name != 0) {
        return by_name;
    }
    /* Same file: fall back to the numeric location. */
    return (lhs->location > rhs->location) - (lhs->location < rhs->location);
}

/*
 * Sort `set` (count elements) with cmplocation() and remove duplicates in
 * place.
 *
 * Returns the number of distinct elements, which occupy the front of the
 * array in sorted order afterwards.
 */
int normalize_set(LOCATION *set, int count)
{
    int kept;
    int i;

    /* Zero or one element is already sorted and unique. */
    if (count < 2) {
        return count < 0 ? 0 : count;
    }

    qsort(set, count, sizeof(LOCATION), cmplocation);

    /* Compact in a single pass: set[0..kept-1] holds the distinct
     * elements found so far. The scan index never leaves the array and
     * whole structs are copied, unlike the earlier unbounded loop with a
     * byte-count memcpy over overlapping memory. */
    kept = 1;
    for (i = 1; i < count; i++) {
        if (cmplocation(&set[kept - 1], &set[i]) != 0) {
            if (kept != i) {
                set[kept] = set[i];
            }
            kept++;
        }
    }
    return kept;
}

/*
 * Copy *src into *dest: the file name up to and including its
 * terminating NUL, then the location. The two records must not overlap.
 */
void copy_location(LOCATION *dest, LOCATION *src)
{
    /* Copy exactly the string plus its terminator, as strcpy() would. */
    size_t name_bytes = strlen(src->filename) + 1;

    memcpy(dest->filename, src->filename, name_bytes);
    dest->location = src->location;
}

int main(int argc, char *argv[])
{
    long keys[MAXKEY];
    int num_of_keys = 0;
    LOCATION *lset1 = NULL;
    LOCATION *lset2 = NULL;
    int nlset;
    int nlset1;
    int nlset2;
    char buffer[BUFFERLEN];
    int toc_size;
    int fd;
    int i;
    int j;

    if ((fd = open(TABLEFILE, O_RDONLY)) == -1) {
        return 1;
    }

    toc_size = read_toc();

    while (fgets(buffer, sizeof(buffer), stdin) != NULL) {
        ENTRY search_entry;
        ENTRY *found_entry;
        ENTRY *next_entry;
        int cur;
        int found_key;

        search_entry.key = strtoul(buffer, NULL, 10);

        found_key = 0;
        for (i = 0; i < num_of_keys; i++) {
            if (keys[i] == search_entry.key) {
                found_key = 1;
                break;
            }
        }
        if (found_key) {
            continue;
        }

        if (num_of_keys == MAXKEY) {
            break;
        }

        keys[num_of_keys++] = search_entry.key;

        found_entry = (ENTRY *)bsearch(&search_entry,
                                       toc_entries,
                                       toc_size,
                                       sizeof(ENTRY),
                                       cmpentry);
        if (found_entry == NULL) {
            return 0;
        }
        next_entry = found_entry + 1;
        nlset = (next_entry->location - found_entry->location) / sizeof(LOCATION
) - 1;

        lseek(fd, found_entry->location, SEEK_SET);

        if (lset1 == NULL) {
            lset1 = (LOCATION *)malloc(sizeof(LOCATION) * nlset);
            for (i = 0; i < nlset; i++) {
                read_location(fd, &lset1[i]);
            }
            nlset1 = normalize_set(lset1, nlset);
            continue;
        }

        lset2 = (LOCATION *)malloc(sizeof(LOCATION) * nlset);
        for (i = 0; i < nlset; i++) {
            read_location(fd, &lset2[i]);
        }
        nlset2 = normalize_set(lset2, nlset);

        cur = 0;
        for (j = 0; j < nlset2; j++) {
            for (i = cur; i < nlset1; i++) {
                if (cmplocation(&lset1[i], &lset2[j]) == 0) {
                    if (i != cur) {
                        copy_location(&lset1[cur], &lset2[j]);
                    }
                    cur++;
                    break;
                }
            }
        }
        nlset1 = cur;
    }
    for (i = 0; i < nlset1; i++) {
        printf("%s %d\n", lset1[i].filename, lset1[i].location);
    }
    return 0;
}
ユーザインターフェース(抜粋)
/**
 * Tokenise the search phrase with ChaSen, look the tokens up with the
 * ./search helper and collect the matching CSV rows.
 *
 * @param string $k     Search phrase; it is kana-normalised and upper-cased
 *                      before being tokenised.
 * @param mixed  $group Not used in the visible part of this function.
 *
 * NOTE(review): this listing is an excerpt; the rows gathered in $result
 * are neither printed nor returned by the code shown here.
 */
function print_result($k, $group) {
  $descriptorspec = array(
      0 => array("pipe", "r"),
      1 => array("pipe", "w"),
      2 => array("file", "./error.log", "a")
  );

  $k = mb_strtoupper(mb_convert_kana($k, "KVA"));

  // Defined up front so the search step below is safe even when ChaSen
  // could not be started.
  $output = array();
  $pipes = array();
  $process = proc_open("./chasen -F '%m\n'", $descriptorspec, $pipes);
  if (is_resource($process)) {
    fwrite($pipes[0], $k . "\n");
    fclose($pipes[0]);

    // Keep every token line (newline included, since it is part of what
    // gets hashed) and drop ChaSen's "EOS" end-of-sentence marker. Testing
    // fgets() directly avoids collecting the false returned at end of file.
    while (($line = fgets($pipes[1], 1024)) !== false) {
      if (!preg_match('/^EOS$/', $line)) {
        array_push($output, $line);
      }
    }
    fclose($pipes[1]);
    $return_value = proc_close($process);
  }

  $result = array();
  if (count($output) == 0) {
    // Nothing to look up.
    return;
  }

  $pipes = array();
  $process = proc_open("./search", $descriptorspec, $pipes);
  if (is_resource($process)) {
    foreach ($output as $hash) {
      // crc32() can be negative on 32-bit PHP; %u always yields the
      // unsigned decimal value.
      fwrite($pipes[0], sprintf("%u", crc32($hash)) . "\n");
    }
    fclose($pipes[0]);

    while (($line = fgets($pipes[1], 1024)) !== false) {
      // Each result line is "<file name> <offset>".
      $entry = preg_split('/ /', rtrim($line, "\r\n"));
      if (count($entry) == 2) {
        if (($fp = @fopen("./data/" . $entry[0], "r")) !== FALSE) {
          fseek($fp, (int)$entry[1]);
          $data = fgetcsv($fp, 512);
          // fgetcsv() returns false at end of file or on error.
          if (is_array($data) && count($data) == 5) {
            array_push($result, $data);
          }
          fclose($fp);
        }
      }
    }
    fclose($pipes[1]);
    $return_value = proc_close($process);
  }
}


© Rakuten Group, Inc.