# source code
#!/usr/bin/ruby
# -*- coding: utf-8 -*-
require 'rubygems'
require 'dbi'
# ----- settings
# Each value can be overridden through an environment variable so that real
# credentials do not have to be written into this file; when the variable is
# not set the original placeholder value is used.
# DBI data source name (KORURI_DBNAME)
DBNAME = ENV.fetch('KORURI_DBNAME', 'DBI:mysql:koruri:localhost')
# database user (KORURI_DBLOGIN)
DBLOGIN = ENV.fetch('KORURI_DBLOGIN', 'yourid')
# database password (KORURI_DBPASS)
DBPASS = ENV.fetch('KORURI_DBPASS', 'yourpass')
# Verbose flag: set KORURI_VERBOSE=1 to print the number of stored sentences
# when the script finishes. Defaults to false.
VERB = ENV['KORURI_VERBOSE'] == '1'
# ---------------------------------
# ----- sub routine
# Splits one tweet text into sentence-like fragments.
#
# The text is first cleaned on a private copy (the caller's string is left
# untouched): ">" at the start of a line, "[B!]" at the start of a line and
# "(12:34)" style digit pairs are blanked out, and a space is forced in front
# of every "http" so that a URL glued to the preceding word becomes a token
# of its own.
#
# The cleaned text is then split on whitespace. Tokens that are mentions
# (@...), retweet/quote markers (RT, QT, RT:...), hashtags (#...) or URLs
# (http...) are dropped and end the current fragment. A token that ends with
# a full stop is kept and also ends the current fragment.
#
# Every returned fragment starts with one space, followed by its tokens
# joined by single spaces. A fragment is emitted only when it is longer than
# 3 characters (leading space included); a shorter fragment is not thrown
# away but carried over, so the tokens that follow are appended to it.
#
# @param text [String, nil] raw tweet text
# @return [Array<String>] fragments in order of appearance; empty when text
#   is nil or contains no usable token
def split_line(text)
  return [] if text.nil?

  # --- charconv (non-destructive: gsub, not gsub!, so text is not modified)
  cleaned = text.gsub(/^>/, ' ')
  cleaned = cleaned.gsub(/^\[B!\]/, ' ')
  cleaned = cleaned.gsub(/\([0-9]+:[0-9]+\)/, ' ')
  cleaned = cleaned.gsub(/http/, ' http')

  # Tokens never contain whitespace after split, so the retweet/quote
  # markers are matched as whole tokens ("RT", "QT") or by the "RT:" prefix.
  dropped_token = /\A(?:@|#|http|RT:)|\A(?:RT|QT)\z/
  # NOTE(review): the condition of the "period" rule was missing from the
  # source; it is reconstructed as "token ends with an ideographic full stop
  # (U+3002) or a fullwidth full stop (U+FF0E)" - confirm the intended
  # punctuation set. Assumes text is UTF-8/ASCII encoded - TODO confirm.
  sentence_end = /[\u3002\uFF0E]\z/

  output = []
  line = ''
  cleaned.split.each do |token|
    # The period rule wins over the drop rules: such a token is always kept.
    ends_sentence = !(token =~ sentence_end).nil?
    dropped = !ends_sentence && !(token =~ dropped_token).nil?

    line = "#{line} #{token}" unless dropped

    # Only a dropped token or a sentence end closes the fragment, and a
    # fragment that is still too short keeps accumulating instead.
    next unless ends_sentence || dropped
    next unless line.length > 3

    output << line
    line = ''
  end
  output << line if line.length > 3
  output
end
# ---------------------------------
# ----- main work
# Reads every tweet_log row whose process column is 0, stores each fragment
# returned by split_line in the sentence table and then sets process=1 on
# the tweet_log row.
# --- query
sql = 'SELECT id,text FROM tweet_log WHERE process=0'
sql_add = 'insert into sentence values (NULL, ?, NULL, NULL, ?, 0, 0)'
sql_update = 'UPDATE tweet_log SET process=1 WHERE id=?'
item_num = 0

# --- open DB
db = DBI.connect(DBNAME, DBLOGIN, DBPASS)
begin
  sth = db.execute(sql)
  begin
    sth.each do |row|
      tweet_id = row[0]
      sentences = split_line(row[1])
      # The inserts for one tweet and the update of its process flag are
      # issued inside the same db.transaction block.
      db.transaction do
        sentences.each do |sentence|
          db.do(sql_add, sentence, tweet_id)
          item_num += 1
        end
        db.do(sql_update, tweet_id)
      end
    end
  ensure
    # Release the statement handle even when a row fails to be processed.
    sth.finish
  end
  printf("processed size = %d\n", item_num) if VERB
ensure
  # done: always close the connection, also on error.
  db.disconnect
end