Monday, October 5, 2009

Using pylucene to index audio files

Lucene is quite an efficient full-text indexing solution. I tried using it to index my audio file tags so that I can launch mplayer, or any other command-line audio player, without having to use complex and time-consuming 'find' commands to build playlists.

Here is a quick'n'dirty solution:
def indexFilesFromPath(dir, idxPath):
initVM(CLASSPATH)
writer = IndexWriter(idxPath, StandardAnalyzer(), True)
count = 1
for root, dirs, files in os.walk(dir):
for file in files:
#print "- "+str(root)+" "+file
sfx = file[-3:]
if sfx.lower() not in [ "mp3","mp4", "ogg", "flac" ]:
continue
filepath = os.path.join(root, file)
try:
doc = Document()
f = tagpy.FileRef(filepath)
title = unicode(f.tag().title)
artist = unicode(f.tag().artist)
album = unicode(f.tag().album)
genre = unicode(f.tag().genre)
path = unicode(filepath)
doc.add(Field("title", title, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("artist", artist, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("album",album, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("genre",genre, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("all", album + u" " + artist + u" " + title, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("path", path, Field.Store.YES, Field.Index.UN_TOKENIZED))
print "\r %5d files indexed"%count, # (%s/%s/%s)..."%(count, f.tag().artist, f.tag().album. f.tag().title)
count+=1
writer.addDocument(doc)
except Exception, e:
#print str(e)
# print "+"
continue
print "Done %d"%count
writer.optimize()
writer.close()
view raw gistfile1.py hosted with ❤ by GitHub


The search function is quite simple too:

def search( searchRequest, idxPath):
initVM(CLASSPATH)
fsDir = FSDirectory.getDirectory(idxPath, False)
searcher = IndexSearcher(fsDir)
language = StandardAnalyzer()
queryp = QueryParser('all', language)
query = queryp.parse(searchRequest) # "title", language)
hits = searcher.search(query)
print "# Found %d hits for %s"%(len(hits), searchRequest)
for i in range(0, hits.length()):
doc = hits.doc(i)
#print u"# %s - %s - %s"%(doc.getField('artist'), doc.getField('album'), doc.getField('title'))
print u"%s"%unicode(doc.getField('path').stringValue())
view raw gistfile1.py hosted with ❤ by GitHub


A quick demo:
Indexing music database:

time ./tsearch.py index /home/fv/music/SANE /home/fv/musicindex
16701 files indexed 
Done 16702
./tsearch.py index /home/fv/music/SANE /home/fv/musicindex  21,67s user 10,18s system 3% cpu 14:14,36 total

Searching:

time ./tsearch.py search "love OR hate" /home/fv/musicindex > playlist.m3u
./tsearch.py search "love OR hate" /home/fv/musicindex  0,52s user 0,09s system 14% cpu 4,310 total

A more complex search:


time ./tsearch.py search "love OR hate OR (title: rain in blood AND artist: slayer)" /home/fv/musicindex > playlist.m3u
./tsearch.py search  /home/fv/musicindex  0,48s user 0,07s system 74% cpu 0,730 total

This code sample is far from perfect; consider it a proof of concept rather than a ready-to-use solution.

The full code:

#!/usr/bin/env python2.5
# vim:set fileencoding=utf-8
import os
import sys
import tagpy
from lucene import \
Document, IndexSearcher, FSDirectory, MultiFieldQueryParser, QueryParser, StandardAnalyzer, IndexWriter, \
StringReader, IndexReader, MoreLikeThis, Term, TermQuery, BooleanQuery,BooleanClause , Field, initVM, CLASSPATH
def indexFilesFromPath(dir, idxPath):
initVM(CLASSPATH)
writer = IndexWriter(idxPath, StandardAnalyzer(), True)
count = 1
for root, dirs, files in os.walk(dir):
for file in files:
#print "- "+str(root)+" "+file
sfx = file[-3:]
if sfx.lower() not in [ "mp3","mp4", "ogg", "flac" ]:
continue
filepath = os.path.join(root, file)
try:
doc = Document()
f = tagpy.FileRef(filepath)
title = unicode(f.tag().title)
artist = unicode(f.tag().artist)
album = unicode(f.tag().album)
genre = unicode(f.tag().genre)
path = unicode(filepath)
doc.add(Field("title", title, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("artist", artist, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("album",album, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("genre",genre, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("all", album + u" " + artist + u" " + title, Field.Store.YES, Field.Index.TOKENIZED))
doc.add(Field("path", path, Field.Store.YES, Field.Index.UN_TOKENIZED))
print "\r %5d files indexed"%count, # (%s/%s/%s)..."%(count, f.tag().artist, f.tag().album. f.tag().title)
count+=1
writer.addDocument(doc)
except Exception, e:
#print str(e)
# print "+"
continue
print "Done %d"%count
writer.optimize()
writer.close()
def search( searchRequest, idxPath):
initVM(CLASSPATH)
fsDir = FSDirectory.getDirectory(idxPath, False)
searcher = IndexSearcher(fsDir)
language = StandardAnalyzer()
queryp = QueryParser('all', language)
query = queryp.parse(searchRequest) # "title", language)
hits = searcher.search(query)
print "# Found %d hits for %s"%(len(hits), searchRequest)
for i in range(0, hits.length()):
doc = hits.doc(i)
#print u"# %s - %s - %s"%(doc.getField('artist'), doc.getField('album'), doc.getField('title'))
print u"%s"%unicode(doc.getField('path').stringValue())
if __name__ == "__main__":
    # Command line:
    #   tsearch.py index  <music directory> <index directory>
    #   tsearch.py search <query>           <index directory>
    # Reject missing arguments or an unknown sub-command with a usage
    # message instead of an IndexError traceback or a silent no-op.
    if len(sys.argv) < 4 or sys.argv[1] not in ("index", "search"):
        sys.stderr.write(
            "usage: %s index <music dir> <index dir>\n"
            "       %s search <query> <index dir>\n" % (sys.argv[0], sys.argv[0]))
        sys.exit(2)
    if sys.argv[1] == "index":
        indexFilesFromPath(sys.argv[2], sys.argv[3])
    else:
        search(sys.argv[2], sys.argv[3])
view raw tsearch.py hosted with ❤ by GitHub


The next step is to clean up my audio files' tags by retrieving tags from the Last.fm web service and putting them in the "genre" tag, then to automatically retrieve song lyrics and index them using the same method.