lucenenet-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Digy" <digyd...@gmail.com>
Subject Extending IndexModifier
Date Wed, 16 May 2007 19:26:48 GMT
Hi George,

I want to propose some extensions for IndexModifier.

1- Adding method "AddIndexes" asked by Dave (Sun, 13 May 2007 23:29:37 GMT).

2- Adding method "UpdateDocuments".

I have a program that performs the following task every night:

	for each updated documents
		delete documents(term)
		add document(doc)
	endfor

But as mentioned in the documentation
(http://lucene.apache.org/java/1_9_0/api/org/apache/lucene/index/IndexModifier.html
 "While you can freely mix calls to add() and delete() using this class, you
should batch you calls for best performance. For example, if you want to
update 20 documents, you should first delete all those documents, then add
all the new documents. ")
this is not a good choice for coding.


Of course the programming logic can be changed, but I think it could be a
good solution for everyone to add a performance upgrade to the IndexModifier,
where IndexModifier defers the "AddDocument" calls and continues to
perform "DeleteDocuments" until some threshold value is reached.

With this patch, my nightly job turns into

	for each updated documents
		UpdateDocuments(term,doc)
	Endfor


So, if you don't have any objection to extending the classes of the
Java-compatible Lucene.Net, either of us can open an issue on JIRA.

I have attached the patch for my proposed solution at the end of this e-mail.

DIGY




PATCH:
+-+-+-+-+-+-+-+-+-+-+- IndexModifier +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
Index: IndexModifier.cs
===================================================================
--- IndexModifier.cs	(revision 536611)
+++ IndexModifier.cs	(working copy)
@@ -108,6 +108,11 @@
 		protected internal int maxBufferedDocs;
 		protected internal int maxFieldLength;
 		protected internal int mergeFactor;
+
+        public const int DEFAULT_MAX_WAITING_DOCS_TO_BE_INDEXED = 20;
+        public int MaxWaitingDocsToBeIndexed =
DEFAULT_MAX_WAITING_DOCS_TO_BE_INDEXED;
+
+        internal System.Collections.Generic.List<SingleDoc> DocsToIndex =
new System.Collections.Generic.List<SingleDoc>();
 		
 		/// <summary> Open an index with write access.
 		/// 
@@ -210,6 +215,7 @@
 			{
 				if (indexWriter != null)
 				{
+                    UpdateIndex(true);
 					indexWriter.Close();
 					indexWriter = null;
 				}
@@ -224,6 +230,7 @@
 			lock (directory)
 			{
 				AssureOpen();
+                UpdateIndex(true);
 				if (indexWriter != null)
 				{
 					indexWriter.Close();
@@ -540,6 +547,9 @@
 			{
 				if (!open)
 					throw new
System.SystemException("Index is closed already");
+
+                UpdateIndex(true);
+
 				if (indexWriter != null)
 				{
 					indexWriter.Close();
@@ -575,5 +585,81 @@
 		System.out.println(indexModifier.docCount() + " docs in
index");
 		indexModifier.close();
 		}*/
+        
+
+        /// <summary>
+        /// @link
Lucene.Net.Index.IndexWriter#AddIndexes(Lucene.Net.Index.IndexReader[]) 
+        /// </summary>
+        /// <param name="readers"></param>
+        public void AddIndexes(IndexReader[] readers)
+        {
+            AssureOpen();
+            CreateIndexWriter();
+            indexWriter.AddIndexes(readers);
+        }
+
+        /// <summary>
+        /// @link
Lucene.Net.Index.IndexWriter#AddIndexes(Lucene.Net.Store.Directory[]) 
+        /// </summary>
+        /// <param name="dirs"></param>
+        public void AddIndexes(Lucene.Net.Store.Directory[] dirs)
+        {
+            AssureOpen();
+            CreateIndexWriter();
+            indexWriter.AddIndexes(dirs);
+        }
+
+
+        private void UpdateIndex(bool IndexImmediately)
+        {
+            if (DocsToIndex.Count == 0) return;
+
+            if (IndexImmediately == true || DocsToIndex.Count >=
MaxWaitingDocsToBeIndexed)
+            {
+                foreach (SingleDoc sd in DocsToIndex)
+                {
+                    this.AddDocument(sd.Doc, sd.Analyzer);
+                }
+                DocsToIndex.Clear();
+            }
+        }
+
+        /// <summary>
+        /// Updates the index. First deletes the document(s) matching
"TermToDelete" then adds the "NewDoc" to the index.
+        /// IndexImmediately is used to defer the indexing for speed
optimization.
+        /// </summary>
+        /// <param name="TermToDelete"></param>
+        /// <param name="NewDoc"></param>
+        /// <param name="Analyzer"></param>
+        /// <param name="IndexImmediately">when false(default) Adding To
index is deferred until Document count waiting to be indexed reaches
MaxWaitingDocsToBeIndexed(Default=20)</param>
+        public void UpdateDocuments(Term TermToDelete, Document NewDoc,
Analyzer Analyzer, bool IndexImmediately)
+        {
+            this.DeleteDocuments(TermToDelete);
+            DocsToIndex.Add(new SingleDoc(NewDoc, Analyzer));
+            UpdateIndex(IndexImmediately);
+        }
+                
+        public void UpdateDocuments(Term TermToDelete, Document NewDoc,
Analyzer Analyzer)
+        {
+            UpdateDocuments(TermToDelete, NewDoc, Analyzer, false);
+        }
+
+        public void UpdateDocuments(Term TermToDelete, Document NewDoc)
+        {
+            UpdateDocuments(TermToDelete, NewDoc, null, false);
+        }
 	}
+
+    internal class SingleDoc
+    {
+        internal  Document Doc = null;
+        internal Analyzer Analyzer = null;
+
+        internal SingleDoc(Document Doc, Analyzer Analyzer)
+        {
+            this.Doc = Doc;
+            this.Analyzer = Analyzer;
+        }
+    }
+    
 }
\ No newline at end of file
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-



Mime
View raw message