lucenenet-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From digy digy <digyd...@gmail.com>
Subject Re: Luke-0.9.x cannot open index files
Date Mon, 27 Apr 2009 07:48:02 GMT
It is not a bug in Lucene.Net and, as my sample code shows, Lucene.Net works
well with Chinese field names.
I think it is a bug in Luke.

DIGY



On Mon, Apr 27, 2009 at 8:49 AM, Floyd Wu <floyd.wu@gmail.com> wrote:

> Hi Digy,
> Thanks for your help.
> But if the Chinese field name is the problem, will it be fixed in Lucene.Net, or
> how can I avoid this problem?
>
> Chinese field name is by design and probably not avoidable.
>
> Floyd
>
> 2009/4/25 Digy <digydigy@gmail.com>
>
> > I think, I found the bug. Here is the dump of the original index:
> >
> >
> >
> > NUMDOCS: 3
> >
> > MAXDOCS: 7
> >
> > DELETED(0): True
> >
> > DELETED(1): True
> >
> > DELETED(2): False
> >
> > DELETED(3): True
> >
> > DELETED(4): True
> >
> > DELETED(5): False
> >
> > DELETED(6): False
> >
> > TERM(0): _l_activationdatetime:552877632000000000
> >
> > TERM(1): _l_author:admin
> >
> > TERM(2): _l_bookmarkcount:0
> >
> > TERM(3): _l_clix:0
> >
> > TERM(4): _l_clix:1
> >
> > TERM(5): _l_creationdatetime:633427319866778624
> >
> > TERM(6): _l_creationdatetime:633427324812559872
> >
> > TERM(7): _l_creationdatetime:633760609388437504
> >
> > TERM(8): _l_deactivationdatetime:155377824000000000
> >
> > TERM(9): _l_deactivationdatetime:155378687999969792
> >
> > TERM(10): _l_document_class:1
> >
> > TERM(11): _l_document_class:98305
> >
> > TERM(12): _l_folder:163841
> >
> > TERM(13): _l_folder:163843
> >
> > TERM(14): _l_hidden:aaa
> >
> > TERM(15): _l_last_modified_datetime:633427319866778624
> >
> > TERM(16): _l_last_modified_datetime:633427324812559872
> >
> > TERM(17): _l_last_modified_datetime:633760609388437504
> >
> > TERM(18): _l_meta:abc
> >
> > TERM(19): _l_meta:abc.ppt
> >
> > TERM(20): _l_meta:ddx
> >
> > TERM(21): _l_meta:doc
> >
> > TERM(22): _l_meta:xyz
> >
> > TERM(23): _l_meta:名
> >
> > TERM(24): _l_meta:問
> >
> > TERM(25): _l_meta:有
> >
> > TERM(26): _l_meta:檔
> >
> > TERM(27): _l_meta:測
> >
> > TERM(28): _l_meta:看
> >
> > TERM(29): _l_meta:試
> >
> > TERM(30): _l_meta:還
> >
> > TERM(31): _l_meta:題
> >
> > TERM(32): _l_parentdocument:196609
> >
> > TERM(33): _l_parentdocument:327681
> >
> > TERM(34): _l_parentdocument:557057
> >
> > TERM(35): _l_ratingavg:0
> >
> > TERM(36): _l_ratingmedian:0
> >
> > TERM(37): _l_ratingstdev:0
> >
> > TERM(38): _l_ratingsum:0
> >
> > TERM(39): _l_read_permission:admin
> >
> > TERM(40): _l_rootdocument:196609
> >
> > TERM(41): _l_rootdocument:327681
> >
> > TERM(42): _l_rootdocument:557057
> >
> > TERM(43): _l_state:0
> >
> > TERM(44): _l_state:2
> >
> > TERM(45): _l_summary:2123456789
> >
> > TERM(46): _l_summary:abc
> >
> > TERM(47): _l_summary:abc.ppt
> >
> > TERM(48): _l_summary:ddx
> >
> > TERM(49): _l_summary:doc
> >
> > TERM(50): _l_summary:xyz
> >
> > TERM(51): _l_summary:有
> >
> > TERM(52): _l_summary:還
> >
> > TERM(53): _l_title:123
> >
> > TERM(54): _l_title:class
> >
> > TERM(55): _l_title:default
> >
> > TERM(56): _l_title:document
> >
> > TERM(57): _l_title:名
> >
> > TERM(58): _l_title:問
> >
> > TERM(59): _l_title:檔
> >
> > TERM(60): _l_title:測
> >
> > TERM(61): _l_title:看
> >
> > TERM(62): _l_title:試
> >
> > TERM(63): _l_title:題
> >
> > TERM(64): _l_unique_key:196609
> >
> > TERM(65): _l_unique_key:327681
> >
> > TERM(66): _l_unique_key:557057
> >
> > TERM(67): _l_version:1
> >
> > TERM(68): 作者:123
> >
> > TERM(69): 摘要:2123456789
> >
> > TERM(70): 摘要:abc
> >
> > TERM(71): 摘要:abc.ppt
> >
> > TERM(72): 摘要:ddx
> >
> > TERM(73): 摘要:doc
> >
> > TERM(74): 摘要:xyz
> >
> > TERM(75): 摘要:有
> >
> > TERM(76): 摘要:還
> >
> > TERM(77): 標題:123
> >
> > TERM(78): 標題:class
> >
> > TERM(79): 標題:default
> >
> > TERM(80): 標題:document
> >
> > TERM(81): 標題:名
> >
> > TERM(82): 標題:問
> >
> > TERM(83): 標題:檔
> >
> > TERM(84): 標題:測
> >
> > TERM(85): 標題:看
> >
> > TERM(86): 標題:試
> >
> > TERM(87): 標題:題
> >
> > TERM(88): 關鍵詞:123
> >
> >
> >
> >
> >
> >
> >
> > And here is some sample code: read docs from the original index and then write
> to
> > a new one.
> >
> >
> >
> > void CreateNewIndex(string OrgIndex)
> >
> >        {
> >
> >            IndexReader reader = IndexReader.Open(OrgIndex);
> >
> >            IndexWriter writer = new IndexWriter("Floyd", new
> > Lucene.Net.Analysis.WhitespaceAnalyzer(),true);
> >
> >
> >
> >            for (int i = 0; i < reader.MaxDoc(); i++)
> >
> >            {
> >
> >                if (reader.IsDeleted(i) == true) continue;
> >
> >
> >
> >                Lucene.Net.Documents.Document orgDoc =
>  reader.Document(i);
> >
> >                System.Collections.IList fields = orgDoc.GetFields();
> >
> >
> >
> >                Lucene.Net.Documents.Document newDoc = new Document();
> >
> >                foreach (Lucene.Net.Documents.Field field in fields)
> >
> >                {
> >
> >                    Lucene.Net.Documents.Field newField = new Field(
> >
> >                        System.Convert.ToBase64String(
> > System.Text.Encoding.UTF8.GetBytes(field.Name())), // <--
> >
> >                        //field.Name(), // <--
> >
> >                        field.StringValue(),
> >
> >                        field.IsStored() ?
> > Lucene.Net.Documents.Field.Store.YES :
> Lucene.Net.Documents.Field.Store.NO
> > ,
> >
> >                        field.IsTokenized() ?
> > Lucene.Net.Documents.Field.Index.TOKENIZED :
> > Lucene.Net.Documents.Field.Index.UN_TOKENIZED);
> >
> >
> >
> >                    newDoc.Add(newField);
> >
> >                }
> >
> >                writer.AddDocument(newDoc);
> >
> >            }
> >
> >
> >
> >            writer.Close();
> >
> >            reader.Close();
> >
> >        }
> >
> >
> >
> >
> >
> > If some field names are Chinese, then Luke returns “read past EOF”. But
> if
> > those field names are replaced with non-Chinese names, then it works.
> >
> >
> >
> > DIGY
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> > -----Original Message-----
> > From: Granroth, Neal V. [mailto:neal.granroth@thermofisher.com]
> > Sent: Friday, April 24, 2009 8:53 PM
> > To: lucene-net-dev@incubator.apache.org
> >  Subject: Luke-0.9.x cannot open index files
> >
> >
> >
> >
> >
> > Digy,
> >
> >
> >
> > Some additional information from the discussion on the lucene-net-user
> list
> > with Floyd Wu.
> >
> >
> >
> >
> >
> > I ran some further tests using Java Lucene 2.3.2 and JDK 1.5.
> >
> >
> >
> > The Java equivalents of the two small test applications I use to inspect
> an
> > index and compact it, function identically to the .NET versions (that
> were
> > built with VS2005 and Lucene.NET 2.3.1).
> >
> >
> >
> > That Luke cannot open the index appears to be a problem within Luke.
> >
> > Even if Floyd's index contains some odd entries, Java Lucene 2.3.2 does
> not
> > flag the index as corrupt; and both the Java and .NET versions report the
> > same index content before and after the optimize operation.
> >
> >
> >
> >
> >
> > -- Neal
> >
> >
> >
> > **************************************************************
> >
> > Neal Granroth
> >
> > Software Engineer, Molecular Spectroscopy
> >
> > Thermo Fisher Scientific
> >
> > 5225 Verona Road, Madison, WI 53711
> >
> >
> >
> > neal.granroth@thermofisher.com
> >
> > Tel: 608-276-5645
> >
> > Fax: 608-276-6328
> >
> >
> >
> > www.thermofisher.com
> >
> >
> >
> > WORLDWIDE CONFIDENTIALITY NOTE: Dissemination, distribution or copying of
> > this e-mail or the information herein by anyone other than the intended
> > recipient, or an employee or agent of a system responsible for delivering
> > the message to the intended recipient, is prohibited. If you are not the
> > intended recipient, please inform the sender and delete all copies.
> >
> >
> >
> > -----Original Message-----
> >
> > From: Digy (JIRA) [mailto:jira@apache.org]
> >
> > Sent: Wednesday, April 08, 2009 6:28 PM
> >
> > To: lucene-net-dev@incubator.apache.org
> >
> > Subject: [jira] Commented: (LUCENENET-169) Changes to make Lucene.NET
> > compatible with ASP.NET <http://asp.net/> Medium Trust Level, in hosting
> > environments (like GoDaddy...)
> >
> >
> >
> >
> >
> >    [
> >
> https://issues.apache.org/jira/browse/LUCENENET-169?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12697335#action_12697335
> ]
> >
> >
> >
> > Digy commented on LUCENENET-169:
> >
> > --------------------------------
> >
> >
> >
> > Although you can overcome all of them somehow;
> >
> >
> >
> > * controlling the lifetime of IndexWriter/IndexReader in a natural
> > manner,
> >
> > * reopening the IndexReader only when needed using (for ex)
> > FileSystemWatcher,
> >
> > * providing a separation between data & business layer,
> >
> > * providing an interface to other apps that may want to write their own user
> > interface,
> >
> > * accessing a single search service from different web apps/from load
> > balanced web servers
> >
> > * controlling the lifetime of searching/indexing code (without being
> > affected by the automatic restart of the IIS processes when some
> memory
> > limit is exceeded (for ex.) )
> >
> > * Ability to access some system resources that can be restricted by IIS
> >
> > etc.
> >
> > make me think a separate search service is a better idea. But in the end, it
> is
> > your design decision.
> >
> > (Think, A WebApp+Solr in Java world)
> >
> >
> >
> >
> >
> > DIGY
> >
> >
> >
> > > Changes to make Lucene.NET compatible with ASP.NET <http://asp.net/>
> Medium Trust Level, in hosting environments (like GoDaddy...)
> >
> > >
> >
> -----------------------------------------------------------------------------------------------------------------
> >
> > >
> >
> > >                 Key: LUCENENET-169
> >
> > >                 URL:
> https://issues.apache.org/jira/browse/LUCENENET-169
> >
> > >             Project: Lucene.Net
> >
> > >          Issue Type: Improvement
> >
> > >         Environment: ASP.NET <http://asp.net/>
> >
> > >            Reporter: Corey Trager
> >
> > >         Attachments: FSDirectory.patch
> >
> > >
> >
> > >
> >
> > > Microsoft has a configuration file for shared hosting for what they
> call
> > "Medium Trust".   There are a couple places in FSDirectory.cs  that
> violate
> > the restrictions of Medium Trust, but I coded workarounds, shown below.
> >
> > > #1)
> >
> > > // Corey Trager, Oct 2008: Commented call to GetTempPath to workaround
> > permission restrictions at shared host.
> >
> > > // LOCK_DIR isn't used anyway.
> >
> > > public static readonly System.String LOCK_DIR = null; //
> > SupportClass.AppSettings.Get("Lucene.Net.lockDir",
> > System.IO.Path.GetTempPath());
> >
> > > #2)
> >
> > >               /// <summary>Returns an array of strings, one for each
> > Lucene index file in the directory. </summary>
> >
> > >               public override System.String[] List()
> >
> > >               {
> >
> > > /* Changes by Corey Trager, Oct 2008, to workaround permission
> > restrictions at shared host */
> >
> > >                System.IO.DirectoryInfo dir = new
> > System.IO.DirectoryInfo(directory.FullName);
> >
> > >               System.IO.FileInfo[] files = dir.GetFiles();
> >
> > >                 string[] list = new string[files.Length];
> >
> > >                 for (int i = 0; i < files.Length; i++)
> >
> > >                 {
> >
> > >                     list[i] = files[i].Name;
> >
> > >                 }
> >
> > >                 return list;
> >
> > > /* end of changes */
> >
> > > //            System.String[] files =
> > SupportClass.FileSupport.GetLuceneIndexFiles(directory.FullName,
> > IndexFileNameFilter.GetFilter());
> >
> > > //            for (int i = 0; i < files.Length; i++)
> >
> > > //            {
> >
> > > //                System.IO.FileInfo fi = new
> > System.IO.FileInfo(files[i]);
> >
> > > //                files[i] = fi.Name;
> >
> > > //            }
> >
> > > //                      return files;
> >
> > >               }
> >
> >
> >
> > --
> >
> > This message is automatically generated by JIRA.
> >
> > -
> >
> > You can reply to this email to add a comment to the issue online.
> >
> >
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message