diff --git a/prosem/prosem-ki.bib b/prosem/prosem-ki.bib index 1aff47e..0f1b828 100755 --- a/prosem/prosem-ki.bib +++ b/prosem/prosem-ki.bib @@ -1,422 +1,191 @@ -% This file was created with JabRef 2.9b2. -% Encoding: Cp1252 +% This file was created with JabRef 2.10b2. +% Encoding: UTF8 -@INPROCEEDINGS{Brin1998, - author = {Brin, Sergey and Page, Lawrence}, - title = {The Anatomy of a Large-Scale Hypertextual Web Search Engine}, - booktitle = {Seventh World Wide Web Conference}, - year = {1998}, - keywords = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google}, - owner = {jim}, - quality = {1}, - timestamp = {2013.10.29} + +@InProceedings{Brin1998, + Title = {The Anatomy of a Large-Scale Hypertextual Web Search Engine}, + Author = {Brin, Sergey and Page, Lawrence}, + Booktitle = {Seventh World Wide Web Conference}, + Year = {1998}, + + Keywords = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google}, + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.10.29} } -@CONFERENCE{Clark2004, - author = {Clark, Stephen and Curran, James R.}, - title = {Parsing the {WSJ} using {CCG} and Log-Linear Models}, - booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational - Linguistics}, - year = {2004}, - pages = {104-111}, - owner = {jim}, - quality = {1}, - timestamp = {2013.10.29} +@Conference{Clark2004, + Title = {Parsing the {WSJ} using {CCG} and Log-Linear Models}, + Author = {Clark, Stephen and Curran, James R.}, + Booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics}, + Year = {2004}, + Pages = {104-111}, + + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.10.29} } -@INBOOK{Jurafsky2009, - chapter = {18}, - pages = {617--644}, - title = {Speech and Language Processing}, - publisher = {Pearson}, - year = {2009}, - author = {Jurafsky, Daniel and Martin, James H.}, - series = {Prentice-Hall series in artificial intelligence}, - edition = {Second}, - abstract = {Sentences get their meanings from the words they contain and the syntactic - order of the words. Therefore the meaning of a sentence is partially - based on the words and its syntactic structure. The composition of - meaning representation is guided by the syntactic components and - relations provided by grammars such as CFGs. - - - A meaning representation is generated by first sending the input through - a parser which results in the syntactic analysis and second passing - this analysis as input to a semantic analyzer. - - - In the syntax-driven semantic analysis it is assumed that syntactic, - lexical and anaphoric ambiguities are not a problem. - - - The semantic meanings are attached to the grammar rules and lexical - entries from which trees are generated in the first place. This is - called rule-to-rule hypothesis. - - - The semantic attachments are written in braces after the syntactic - rules themselves. - - - After the syntactic analysis has been created, every word receives - a FOL predicate and/or term. The semantic analyzer goes the tree - up until the complete FOL term has been created. 
On the way lambda
-	reduction is used to replace predicates and terms with their proper
-	meanings, received from other parts of the tree.},
-  booktitle = {Speech and Language Processing},
-  owner = {jim},
-  quality = {1},
-  timestamp = {2013.11.16}

+@Conference{Gnjatovic2012,
+  Title                    = {A {Cognitively-Inspired} {Method} for {Meaning Representation} in {Dialogue Systems}},
+  Author                   = {Gnjatović, Milan and Delić, Vlado},
+  Booktitle                = {3rd IEEE International Conference on Cognitive Infocommunications},
+  Year                     = {2012},
+  Month                    = {December},
+  Pages                    = {383-388},
+
+  Owner                    = {jim},
+  Timestamp                = {2014.01.18}
}

-@INBOOK{Jurafsky2009a,
-  chapter = {17},
-  pages = {579--616},
-  title = {Speech and Language Processing},
-  publisher = {Pearson},
-  year = {2009},
-  author = {Jurafsky, Daniel and Martin, James H.},
-  series = {Prentice-Hall series in artificial intelligence},
-  edition = {Second},
-  abstract = {Lambda notation is used to bind variables dynamically to later appearing
-	contents.
-
-	lambda x P(x)(y) results in P(y) after a lambda reduction as x has
-	been bound to y.
-
-	lambda P P(x)(lambda x Restaurant(x)) results in lambda x Restaurant(x)(x)
-	which results in Restaurant(x)},
-  booktitle = {Speech and Language Processing},
-  owner = {jim},
-  quality = {1},
-  timestamp = {2013.11.16}

+@InBook{Jurafsky2009,
+  Title                    = {Speech and Language Processing},
+  Author                   = {Jurafsky, Daniel and Martin, James H.},
+  Chapter                  = {18},
+  Pages                    = {617--644},
+  Publisher                = {Pearson},
+  Year                     = {2009},
+  Edition                  = {Second},
+  Series                   = {Prentice-Hall series in artificial intelligence},
+
+  Abstract                 = {Sentences get their meanings from the words they contain and the syntactic order of the words. Therefore the meaning of a sentence is partially based on the words and its syntactic structure. The composition of meaning representation is guided by the syntactic components and relations provided by grammars such as CFGs. A meaning representation is generated by first sending the input through a parser which results in the syntactic analysis and second passing this analysis as input to a semantic analyzer. In the syntax-driven semantic analysis it is assumed that syntactic, lexical and anaphoric ambiguities are not a problem. The semantic meanings are attached to the grammar rules and lexical entries from which trees are generated in the first place. This is called the rule-to-rule hypothesis. The semantic attachments are written in braces after the syntactic rules themselves. After the syntactic analysis has been created, every word receives a FOL predicate and/or term. The semantic analyzer goes up the tree until the complete FOL term has been created. 
On the way lambda reduction is used to replace predicates and terms with their proper meanings, received from other parts of the tree.}, + Booktitle = {Speech and Language Processing}, + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.11.16} } -@INBOOK{Jurafsky2009b, - chapter = {13}, - pages = {461--492}, - title = {Speech and Language Processing}, - publisher = {Pearson}, - year = {2009}, - author = {Jurafsky, Daniel and Martin, James H.}, - series = {Prentice-Hall series in artificial intelligence}, - edition = {Second}, - owner = {jim}, - quality = {1}, - timestamp = {2013.11.17} +@InBook{Jurafsky2009a, + Title = {Speech and Language Processing}, + Author = {Jurafsky, Daniel and Martin, James H.}, + Chapter = {17}, + Pages = {579--616}, + Publisher = {Pearson}, + Year = {2009}, + Edition = {Second}, + Series = {Prentice-Hall series in artificial intelligence}, + + Abstract = {Lambda notation is used to bind variables dynamically to later appearing contents. lambda x P(x)(y) results in P(y) after a lambda reduction as x has been bound to y. lambda P P(x)(lambda x Restaurant(x)) results in lambda x Restaurant(x)(x) which results in Restaurant(x)}, + Booktitle = {Speech and Language Processing}, + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.11.16} } -@CONFERENCE{Kessler1997, - author = {Kessler, Brett and Nunberg, Geoffrey and Schuetze, Hinrich}, - title = {Automatic Detection of Text Genre}, - booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational - Linguistics}, - year = {1997}, - pages = {32-38}, - owner = {jim}, - quality = {1}, - timestamp = {2013.10.29} +@InBook{Jurafsky2009b, + Title = {Speech and Language Processing}, + Author = {Jurafsky, Daniel and Martin, James H.}, + Chapter = {13}, + Pages = {461--492}, + Publisher = {Pearson}, + Year = {2009}, + Edition = {Second}, + Series = {Prentice-Hall series in artificial intelligence}, + + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.11.17} } -@CONFERENCE{Klein2003, - author = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher - D.}, - title = {Named Entity Recognition with Character-Level Models}, - booktitle = {Conference on Natural Learning (CoNLL)}, - year = {2003}, - pages = {180-183}, - owner = {jim}, - quality = {1}, - timestamp = {2013.10.29} +@Conference{Kessler1997, + Title = {Automatic Detection of Text Genre}, + Author = {Kessler, Brett and Nunberg, Geoffrey and Schuetze, Hinrich}, + Booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational Linguistics}, + Year = {1997}, + Pages = {32-38}, + + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.10.29} } -@TECHREPORT{Paskin2001, - author = {Paskin, Mark A.}, - title = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram - Models}, - institution = {University of California}, - year = {2001}, - number = {UCB/CSD-01-1148}, - month = {June}, - abstract = {In Dependency Grammar there are head words and dependents. Each phrase - has only one head word. The head word determines how all of its dependents - may be syntactically combined with other words to form a sentence. - A head word and all of its dependents form a constituent. In every - sentence there may be one or more dependency relationships with one - head word each. - - Dependents that precede their head are called predependents and dependents - that follow their head are called postdependents. - - - A dependency parse consists of a set of dependency relationships that - satisfies three constraints: 1. 
Every word except one (the root) - is dependent to exactly one head. 2. The dependency relationships - are acyclic; no word is, through a sequence of dependency relationships, - dependent to itself. 3. When drawn as a graph above the sentence, - no two dependency relations cross - a property known as projectivity - or planarity. - - - The Grammatical Bigram Probability Model assumes that all the dependents - of a head word are independent of one another and their relative - order. This is a strong approximation as in full English there are - argument structure constraints that rely on the order of dependents. - This simplification allows for a reduced computational complexity - for parsing and learning. The grammar model falls into the class - of "Bilexical grammars". - - - A dependency parse consists of multiple spans. A span has at least - two words up to n words. Spans have one property: No word in the - span has a parent outside the span. Spans can be joined and closed. - To join the span one of them has to be connected (both end words - are connected with an edge) and both spans have to share one endword. - The new span will be connected if both subspans were connected. If - that is not the case, it can be closed by adding an edge between - the endwords of the new span. - - - Every dependency parse has a unique span decomposition. For joining - the left subspan has be simple. That means it has to have an edge - between its endwords or consist of two words only. Relying on this - ensures that each span is derived only once. - - - Every span has a signature. This signature states the indexes of its - endwords, if it is simple and whether the left or right endword have - parents within the span. Spans where both the left and right endword - have the parent within the string are called toplevel signatures - as such signatures characterize valid parses. - - - Parser operations take signatures as input rather than spans. They - produce signatures as well. SEED creates an unconnected and simple - span with two adjacent words. CLOSE-LEFT adds an edge between the - endwords and makes the left endword the parent of the right one. - CLOSE-RIGHT does the opposite and makes the right endword the parent - of the left one. These operators require that neither the left nor - the right endword have a parent within the span. - - - JOIN takes two input spans and joins them. It requires that the spans - share an endword (1.), the shared endword has one parent (2.) and - the left input is simple (3.). The JOIN rule applies only if the - left span doesn't start the sentence. - - - These operators constitute an algebra over span signatures called - span signature algebra. A derivation D is an expression in this algebra. - Like operations it evaluates to span signatures. These expressions - can be represented as trees where the nodes are operations. There - is an isomorphism between dependency parses and their corresponding - derivations. - - - Optimal derivation must consist of an operation over the results of - optimal sub-derivations. Therefore it is enough to record the parse - operation with the most likely derivation of a given signature in - order to reconstruct the most likely derivation of the entire sentence. - - - The chart-parse algorithm returns the optimal parse. It uses a subprocedure - called EXTRACT-OPT-PARSE that constructs the optimal parse by finding - the top-level signature (sigma) with maximum optimal probability - (pi*). 
It backtracks then recursively through the optimal derivation - defined by (omega*). If CLOSE operations are encountered edges are - recorded in the parse. The algorithm requires O(n³) time and O(n²) - space.}, - owner = {jim}, - quality = {1}, - timestamp = {2013.10.29} +@Conference{Klein2003, + Title = {Named Entity Recognition with Character-Level Models}, + Author = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher D.}, + Booktitle = {Conference on Natural Learning (CoNLL)}, + Year = {2003}, + Pages = {180-183}, + + Owner = {jim}, + Quality = {1}, + Timestamp = {2013.10.29} } -@INBOOK{Russel2010, - chapter = {23}, - pages = {888--927}, - title = {Artificial intelligence: A Modern Approach}, - publisher = {Pearson}, - year = {2009}, - author = {Russel, Stuart J. and Norvig, Peter}, - series = {Prentice-Hall series in artificial intelligence}, - edition = {Third}, - abstract = {The first method to understanding natural language is syntactic analysis - or parsing. The goal is to find the phrase structure of a sequence - of words according to the rules of the applied grammar. - - A strict top-to-bottom or bottom-to-top parsing can be inefficient. - Given two sentences with the same first 10 words and a difference - only from the 11th word on, parsing from left-to-right would force - the parser to make a guess about the nature of the sentence. But - it doesn't know if it's right until the 11th word. From there it - had to backtrack and reanalyze the sentence. - - - To prevent that dynamic programming is used. Every analyzed substring - gets stored for later. Once it is discovered that for example "the - students in section 2 of Computer Science 101" is a noun phrase, - this information can be stored in a structure known as chart. Algorithms - that do such storing are called chart parsers. One of this chart - parsers is a bottom-up version called CYK algorithm after its inventors - John Cocke, Daniel Younger and Tadeo Kasami. This algorithm requires - a grammar in the Chomsky Normal Form. The algorithm takes O(n²m) - space for the P table with n being the number of words in the sentence - and m the number of nonterminal symbols in the grammar. It takes - O(n³m) time whereas m is constant for a particular grammar. That's - why it is commonly described as O(n³). There is no faster algorithm - for general context-free grammars. - - - The CYK algorithm only co mputes the probability of the most probable - tree. The subtrees are all represented in P table. - - - PCFGs (Probabilistic context free grammars) have many rules with a - probability for each one of them. Learning the grammar from data - is better than a knowledge engineering approach. Learning is easiest - if we are given a corpus of correctly parsed sentences; commonly - known as a treebank. The best known treebank is the Penn Treebank - as it consists of 3 million words which have been annotated with - part of speech and parse-tree structure. Given an amount of trees, - a PCFG can be created just by counting and smoothing. - - - If no treebank is given it is still possible to learn the grammar - but it is more difficult. In such a case there are actually two problems: - First learning the structure of the grammar rules and second learning - the probabilities associated with them. - - - PCFGs have the problem that they are context-free. Combining a PCFG - and Markov model will get the best of both. This leads ultimately - to lexicalized PCFGs. But another problem of PCFGs is there preference - for short sentences. 
- - - Lexicalized PCFGs introduce so called head words. Such words are the - most important words in a phrase and the probabilities are calculated - between the head words. Example: "eat a banana" "eat" is the head - of the verb phrase "eat a banana", whereas "banana" is the head of - the noun phrase "a banana". Probability P1 now depends on "eat" and - "banana" and the result would be very high. If the head of the noun - phrase were "bandanna", the result would be significantly lower. - - - The next step are definite clause grammars. They can be used to parse - in a way of logical inference and makes it possible to reason about - languages and strings in many different ways. Furthermore augmentations - allow for distinctions in a single subphrase. For example the noun - phrase (NP) depends on the subject case and the person and number - of persons. A real world example would be "to smell". It is "I smell", - "you smell", "we smell", "you smell" and "they smell" but "he/she/it - smells". It depends on the person what version is taken. - - - Semantic interpretation is used to give sentences a meaning. This - is achieved through logical sentences. The semantics can be added - to an already augmented grammar (created during the previous step), - resulting in multiple augmentations at the same time. Chill is an - inductive logic programming program that can learn to achieve 70% - to 85% accuracy on various database query tasks. - - - But there are several complications as English is endlessly complex. - First there is the time at which things happened (present, past, - future). Second you have the so called speech act which is the speaker's - action that has to be deciphered by the hearer. The hearer has to - find out what type of action it is (a statement, a question, an order, - a warning, a promise and so on). Then there are so called long-distance - dependencies and ambiguity. The ambiguity can reach from lexical - ambiguity where a word has multiple usages, over syntactic ambiguity - where a sentence has multiple parses up to semantic ambiguity where - the meaning of and the same sentence can be different. Last there - is ambiguity between literal meaning and figurative meanings. - - - Finally there are four models that need to be combined to do disambiguation - properly: the world model, the mental model, the language model and - the acoustic model. - - - -- not so much an abstract of the specific content of that section - as an abstract about speech recognition in general -- - - - The second method is speech recognition. It has the added difficulty - that the words are not clearly separated and every speaker can pronounce - the same sentence with the same meaning different. An example is - "The train is approaching". Another written form would be "The train's - approaching". Both convey the same meaning in the written language. - But if a BBC, a CNN and a german news anchor speeks this sentence - it will sound dramatically different. Speech recognition has to deal - with that problem to get the written text associated with the spoken - words. From the text the first method can than be used to analyze - the words and find a meaning. Finally this meaning can be used to - create some kind of action in a dialog system. - - - -- - - - Some problems of speech recognition are segmentation, coarticulation - and homophones. Two used models are the acoustic model and the language - model. Another major model is the noisy channel model, named after - Claude Shannon (1948). 
He showed that the original message can always - be recovered in a noisy channel if the original message is encoded - in a redundant enough way. - - - The acoustic model in particular is used to get to the really interesting - parts. It is not interesting how words were spoken but more what - words where spoken. That means that not all available information - needs to be stored and a relative low sample rate is enough. 80 samples - at 8kHz with a frame length of about 10 milliseconds is enough for - that matter. To distinguish words so called phones are used. There - are 49 phones used in English. A phoneme is the smallest unit of - sound that has a distinct meaning to speakers of a particular language. - Back to the frames: every frame is summarized by a vector of features. - Features are important aspects of a speech signal. It can be compared - to listening to an orchestra and saying "here the French horns are - playing loudly and the violins are playing softly". Yet another difficulty - are dialect variations. - - - The language model should be learned from a corpus of transcripts - of spoken language. But such a thing is more difficult than building - an n-gram model of text, because it requires a hidden Markov model. - - - All in all speech recognition is most effective when used for a specific - task against a restricted set of options. A general purpose system - can only work accurately if it creates one model for every speaker. - Prominent examples like Apple's siri are therefore not very accurate.}, - bookauthor = {Russel, Stuart J. and Norvig, Peter}, - booktitle = {Artificial intelligence: A Modern Approach}, - date = {December 11}, - owner = {jim}, - timestamp = {2013.10.24} +@Article{Li, + Title = {A Comparison of {CYK} and {Earley} {Parsing} {Algorithms}}, + Author = {Li, Te and Alagappan, Devi}, + + Institution = {Arizona State University}, + Owner = {jim}, + Timestamp = {2014.01.07} } -@INPROCEEDINGS{Sleator1993, - author = {Sleator, Daniel D. K. and Temperley, Davy}, - title = {Parsing English with a Link Grammar}, - booktitle = {Third Annual Workshop on Parsing technologies}, - year = {1993}, - owner = {jim}, - quality = {1}, - timestamp = {2013.10.29} +@TechReport{Paskin2001, + Title = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram Models}, + Author = {Paskin, Mark A.}, + Institution = {University of California}, + Year = {2001}, + Month = {June}, + Number = {UCB/CSD-01-1148}, + + Abstract = {In Dependency Grammar there are head words and dependents. Each phrase has only one head word. The head word determines how all of its dependents may be syntactically combined with other words to form a sentence. A head word and all of its dependents form a constituent. In every sentence there may be one or more dependency relationships with one head word each. Dependents that precede their head are called predependents and dependents that follow their head are called postdependents. A dependency parse consists of a set of dependency relationships that satisfies three constraints: 1. Every word except one (the root) is dependent to exactly one head. 2. The dependency relationships are acyclic; no word is, through a sequence of dependency relationships, dependent to itself. 3. When drawn as a graph above the sentence, no two dependency relations cross - a property known as projectivity or planarity. The Grammatical Bigram Probability Model assumes that all the dependents of a head word are independent of one another and their relative order. 
This is a strong approximation as in full English there are argument structure constraints that rely on the order of dependents. This simplification allows for a reduced computational complexity for parsing and learning. The grammar model falls into the class of "Bilexical grammars". A dependency parse consists of multiple spans. A span has at least two and up to n words. Spans have one property: No word in the span has a parent outside the span. Spans can be joined and closed. To join the span one of them has to be connected (both end words are connected with an edge) and both spans have to share one endword. The new span will be connected if both subspans were connected. If that is not the case, it can be closed by adding an edge between the endwords of the new span. Every dependency parse has a unique span decomposition. For joining the left subspan has to be simple. That means it has to have an edge between its endwords or consist of two words only. Relying on this ensures that each span is derived only once. Every span has a signature. This signature states the indexes of its endwords, if it is simple and whether the left or right endword have parents within the span. Spans where both the left and right endword have the parent within the string are called toplevel signatures as such signatures characterize valid parses. Parser operations take signatures as input rather than spans. They produce signatures as well. SEED creates an unconnected and simple span with two adjacent words. CLOSE-LEFT adds an edge between the endwords and makes the left endword the parent of the right one. CLOSE-RIGHT does the opposite and makes the right endword the parent of the left one. These operators require that neither the left nor the right endword have a parent within the span. JOIN takes two input spans and joins them. It requires that the spans share an endword (1.), the shared endword has one parent (2.) and the left input is simple (3.). The JOIN rule applies only if the left span doesn't start the sentence. These operators constitute an algebra over span signatures called span signature algebra. A derivation D is an expression in this algebra. Like operations it evaluates to span signatures. These expressions can be represented as trees where the nodes are operations. There is an isomorphism between dependency parses and their corresponding derivations. Optimal derivation must consist of an operation over the results of optimal sub-derivations. Therefore it is enough to record the parse operation with the most likely derivation of a given signature in order to reconstruct the most likely derivation of the entire sentence. The chart-parse algorithm returns the optimal parse. It uses a subprocedure called EXTRACT-OPT-PARSE that constructs the optimal parse by finding the top-level signature (sigma) with maximum optimal probability (pi*). It then backtracks recursively through the optimal derivation defined by (omega*). If CLOSE operations are encountered edges are recorded in the parse. The algorithm requires O(n³) time and O(n²) space.},
+  Owner                    = {jim},
+  Quality                  = {1},
+  Timestamp                = {2013.10.29}
}

-@CONFERENCE{Smith2008,
-  author = {Smith, David A.
and Eisner, Jason},
-  title = {Dependency Parsing by Belief Propagation},
-  booktitle = {Conference on Empirical Methods in Natural Language Processing},
-  year = {2008},
-  pages = {145-156},
-  date = {October 25 - October 27},
-  owner = {jim},
-  quality = {1},
-  timestamp = {2013.10.29}

+@Misc{Portalarium2013,
+  Title                    = {Shroud of the {Avatar} {Six Month Progress Video}},
+
+  Author                   = {Portalarium},
+  HowPublished             = {\url{https://www.youtube.com/watch?v=yGa6hR4a87U}},
+  Month                    = {November},
+  Note                     = {Accessed on 18.01.2014 12:07},
+  Year                     = {2013},
+
+  Owner                    = {jim},
+  Timestamp                = {2014.01.12}
+}
+
+@InBook{Russel2010,
+  Title                    = {Artificial Intelligence: A Modern Approach},
+  Author                   = {Russell, Stuart J. and Norvig, Peter},
+  Chapter                  = {23},
+  Pages                    = {888--927},
+  Publisher                = {Pearson},
+  Year                     = {2009},
+  Edition                  = {Third},
+  Series                   = {Prentice-Hall series in artificial intelligence},
+
+  Abstract                 = {The first method to understanding natural language is syntactic analysis or parsing. The goal is to find the phrase structure of a sequence of words according to the rules of the applied grammar. A strict top-to-bottom or bottom-to-top parsing can be inefficient. Given two sentences with the same first 10 words and a difference only from the 11th word on, parsing from left-to-right would force the parser to make a guess about the nature of the sentence. But it doesn't know if it's right until the 11th word. From there it had to backtrack and reanalyze the sentence. To prevent that dynamic programming is used. Every analyzed substring gets stored for later. Once it is discovered that for example "the students in section 2 of Computer Science 101" is a noun phrase, this information can be stored in a structure known as chart. Algorithms that do such storing are called chart parsers. One of these chart parsers is a bottom-up version called CYK algorithm after its inventors John Cocke, Daniel Younger and Tadao Kasami. This algorithm requires a grammar in the Chomsky Normal Form. The algorithm takes O(n²m) space for the P table with n being the number of words in the sentence and m the number of nonterminal symbols in the grammar. It takes O(n³m) time whereas m is constant for a particular grammar. That's why it is commonly described as O(n³). There is no faster algorithm for general context-free grammars. The CYK algorithm only computes the probability of the most probable tree. The subtrees are all represented in the P table. PCFGs (Probabilistic context free grammars) have many rules with a probability for each one of them. Learning the grammar from data is better than a knowledge engineering approach. Learning is easiest if we are given a corpus of correctly parsed sentences; commonly known as a treebank. The best known treebank is the Penn Treebank as it consists of 3 million words which have been annotated with part of speech and parse-tree structure. Given a number of trees, a PCFG can be created just by counting and smoothing. If no treebank is given it is still possible to learn the grammar but it is more difficult. In such a case there are actually two problems: First learning the structure of the grammar rules and second learning the probabilities associated with them. PCFGs have the problem that they are context-free. Combining a PCFG and Markov model will get the best of both. This leads ultimately to lexicalized PCFGs. But another problem of PCFGs is their preference for short sentences. Lexicalized PCFGs introduce so-called head words. 
Such words are the most important words in a phrase and the probabilities are calculated between the head words. Example: "eat a banana" "eat" is the head of the verb phrase "eat a banana", whereas "banana" is the head of the noun phrase "a banana". Probability P1 now depends on "eat" and "banana" and the result would be very high. If the head of the noun phrase were "bandanna", the result would be significantly lower. The next step is definite clause grammars. They can be used to parse by way of logical inference and make it possible to reason about languages and strings in many different ways. Furthermore augmentations allow for distinctions in a single subphrase. For example the noun phrase (NP) depends on the subject case and the person and number of persons. A real world example would be "to smell". It is "I smell", "you smell", "we smell", "you smell" and "they smell" but "he/she/it smells". It depends on the person what version is taken. Semantic interpretation is used to give sentences a meaning. This is achieved through logical sentences. The semantics can be added to an already augmented grammar (created during the previous step), resulting in multiple augmentations at the same time. Chill is an inductive logic programming program that can learn to achieve 70% to 85% accuracy on various database query tasks. But there are several complications as English is endlessly complex. First there is the time at which things happened (present, past, future). Second you have the so-called speech act which is the speaker's action that has to be deciphered by the hearer. The hearer has to find out what type of action it is (a statement, a question, an order, a warning, a promise and so on). Then there are so-called long-distance dependencies and ambiguity. The ambiguity can reach from lexical ambiguity where a word has multiple usages, over syntactic ambiguity where a sentence has multiple parses up to semantic ambiguity where the meaning of the same sentence can be different. Last there is ambiguity between literal meaning and figurative meanings. Finally there are four models that need to be combined to do disambiguation properly: the world model, the mental model, the language model and the acoustic model. -- not so much an abstract of the specific content of that section as an abstract about speech recognition in general -- The second method is speech recognition. It has the added difficulty that the words are not clearly separated and every speaker can pronounce the same sentence with the same meaning differently. An example is "The train is approaching". Another written form would be "The train's approaching". Both convey the same meaning in the written language. But if a BBC, a CNN and a German news anchor speaks this sentence it will sound dramatically different. Speech recognition has to deal with that problem to get the written text associated with the spoken words. From the text the first method can then be used to analyze the words and find a meaning. Finally this meaning can be used to create some kind of action in a dialogue system. -- Some problems of speech recognition are segmentation, coarticulation and homophones. Two used models are the acoustic model and the language model. Another major model is the noisy channel model, named after Claude Shannon (1948). He showed that the original message can always be recovered in a noisy channel if the original message is encoded in a redundant enough way. 
The acoustic model in particular is used to get to the really interesting parts. It is not interesting how words were spoken but more what words were spoken. That means that not all available information needs to be stored and a relatively low sample rate is enough. 80 samples at 8kHz with a frame length of about 10 milliseconds is enough for that matter. To distinguish words so-called phones are used. There are 49 phones used in English. A phoneme is the smallest unit of sound that has a distinct meaning to speakers of a particular language. Back to the frames: every frame is summarized by a vector of features. Features are important aspects of a speech signal. It can be compared to listening to an orchestra and saying "here the French horns are playing loudly and the violins are playing softly". Yet another difficulty is dialect variations. The language model should be learned from a corpus of transcripts of spoken language. But such a thing is more difficult than building an n-gram model of text, because it requires a hidden Markov model. All in all speech recognition is most effective when used for a specific task against a restricted set of options. A general purpose system can only work accurately if it creates one model for every speaker. Prominent examples like Apple's Siri are therefore not very accurate.},
+  Bookauthor               = {Russell, Stuart J. and Norvig, Peter},
+  Booktitle                = {Artificial Intelligence: A Modern Approach},
+  Date                     = {December 11},
+  Owner                    = {jim},
+  Timestamp                = {2013.10.24}
+}
+
+@InProceedings{Sleator1993,
+  Title                    = {Parsing English with a Link Grammar},
+  Author                   = {Sleator, Daniel D. K. and Temperley, Davy},
+  Booktitle                = {Third Annual Workshop on Parsing technologies},
+  Year                     = {1993},
+
+  Owner                    = {jim},
+  Quality                  = {1},
+  Timestamp                = {2013.10.29}
+}
+
+@Conference{Smith2008,
+  Title                    = {Dependency Parsing by Belief Propagation},
+  Author                   = {Smith, David A. and Eisner, Jason},
+  Booktitle                = {Conference on Empirical Methods in Natural Language Processing},
+  Year                     = {2008},
+  Pages                    = {145-156},
+
+  Date                     = {October 25 - October 27},
+  Owner                    = {jim},
+  Quality                  = {1},
+  Timestamp                = {2013.10.29}
}
diff --git a/prosem/prosempaper.tex b/prosem/prosempaper.tex
index 2f2704f..4d7fe55 100755
--- a/prosem/prosempaper.tex
+++ b/prosem/prosempaper.tex
@@ -27,7 +27,8 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Bind packages:
 \usepackage{acronym}                    % Acronyms
-\usepackage{algorithmic}                % Algorithms and Pseudocode
+%\usepackage{algorithmic}               % Algorithms and Pseudocode
+\usepackage{algpseudocode}
 \usepackage{algorithm}                  % Algorithms and Pseudocode
 \usepackage{amsfonts}                   % AMS Math Packet (Fonts)
 \usepackage{amsmath}                    % AMS Math Packet
@@ -60,6 +61,10 @@
 \usepackage{tabularx}                   % Tables with fixed width but variable rows
 \usepackage{url,xspace,boxedminipage}   % Accurate display of URLs

+\usepackage{float}
+\floatstyle{boxed}
+\restylefloat{figure}
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Configurationen:

@@ -173,7 +178,7 @@
 Syntactic parsing and semantic analysis are two important methods for understanding natural language. Each of them has their individual strengths and weaknesses. But both of them have major issues with ambiguity once a restricted environment is left. Understanding unrestricted natural language is therefore far from being reached.
% Lists:
-\setcounter{tocdepth}{2}        % depth of the table of contents (for Seminars 2 is recommented)
+\setcounter{tocdepth}{2}        % depth of the table of contents (for Seminars 2 is recommended)
 \tableofcontents
 \pagenumbering{arabic}
 \clearpage
@@ -190,31 +195,157 @@
 	It's the dream of many Science-Fiction fans: A fully sentient AI. Let's ignore for a moment all the odds that are against it (morality, physics, etc.) and concentrate on one aspect that is mandatory for even much less ambitious dreams. Imagine a computer game in which you can talk natural language to the NPC counterparts so that they react appropriately to it. Well maybe that is still too ambitious. What about writing what you want to say? In that case the computer needs to understand what you are writing so that it can react to it.

-	This process of understanding natural language contains multiple methods. The first one is the syntactic parsing, the second one the semantic analysis. Syntactic parsing relies on a grammar that describes the set of possible input, also called syntax. The syntax specifies what are allowed sentence structures and how these are built.

-	The semantic analysis relies on the semantics of a given input. That means what the given input means. An example: ``You run around the bush''. The semantic meaning of this sentence is that you are running around a bush. The pragmatics define what is the intended meaning of an input. In this example it's not that you run around the bush but actually that you take a long time to get to the point in a discussion. It's a so called idiom. This difference between semantic meaning, where just the sentence as it is written is considered, and pragmatic meaning, where the intended meaning is considered, generates ambiguity that is easy for humans to resolve but difficult for computers. But even the pragmatics in this example are ambigious, because it depends on the context what it actually means. If two persons are walking around in a forest and one starts running around the bush, the sentence of this example, would have the semantic meaning as it's pragmatic meaning.
+	The input in this case is plain text, following the grammar of a natural language like English. Without loss of generality it is assumed that the input is syntactically correct. The computer therefore gets a certain amount of text that follows a specified grammar; for the scope of this paper, the grammar of modern English is assumed. With this information available, the computer still knows nothing about the meaning of the text. You could ask for a hot chocolate or you could write nasty things; it makes no difference at this point.
+
+	In order to make the computer react properly to your input, it needs to process and thereby understand the input in the first place. This can be achieved by using methods for natural language understanding. For the scope of this paper, ``natural language understanding'' covers all the methods used for understanding natural language, both written and spoken. This paper looks more closely at two of the methods used to understand written language. The first one is syntactic parsing, the second one semantic analysis. To understand how these methods work, you need to know the basic terminology of the subject matter.
In the following paragraphs the terms syntax, semantics and pragmatics are explained with respect to the two mentioned methods.
+
+	The first method, syntactic parsing, relies on a grammar that describes the set of possible input, also called syntax. The syntax specifies which sentence structures are allowed and how they are built.
+
+	The semantic analysis relies on the semantics of a given input, that is, on what the given input means. An example: ``You run around the bush''. The semantic meaning of this sentence is that you are running around a bush.
+	The pragmatics, though, define the intended meaning of an input. In this example it's not that you run around the bush but actually that you take a long time to get to the point in a discussion. It's a so-called idiom. This difference between semantic meaning, where just the sentence as it is written is considered, and pragmatic meaning, where the intended meaning is considered, generates ambiguity that is easy for humans to resolve but difficult for computers. But even the pragmatics in this example are ambiguous, because it depends on the context what it actually means. If two persons are walking around in a forest and one starts running around the bush, the pragmatic meaning of the sentence in this example would be the previously mentioned semantic meaning. On top of that the semantic meaning itself isn't always clear either. Sometimes words have multiple meanings, so that even the semantic meaning can have different possible interpretations.
+	The basic terminology should be clear by now. Whenever there are additional prerequisites to understand a method, these are explained in the section of that method.
+
+	Before the actual evaluation of the methods starts, the use of their results is briefly described. After both syntactic parsing and semantic analysis have been executed, in this order, you have a semantic representation of the input. This representation could be used, for example, for an interface to a knowledge database where the user simply enters a question and gets an appropriate answer.
+
+	But there are other possible use cases as well. The two described methods could be used in a chatbot.
+
 	In this paper both syntactic parsing and semantic analysis are presented. After the presentation of the methods, they are critically discussed to finally come to a conclusion.

\section{Evaluation of methods}
\label{sec:evalMethods}
+
+	Syntactic parsing and semantic analysis each offer a broad range of approaches. In this paper the ``syntax-driven semantic analysis''\cite[p.~617]{Jurafsky2009} is evaluated. It's especially interesting because it utilizes the output of the syntactic parsing to analyze the meaning. Therefore the two methods can be lined up in chronological order. First comes the syntactic parsing and then the semantic analysis. The methods are presented here in the same order.
+
+	They will be explained with the help of an example. Let's take the sentence ``The tree is very high''. For every method the theory is introduced first and the practical application with the example comes after it.
+
\subsection{Syntactic Parsing}
\label{subSec:syntacticParsing}

-	Syntactic Parsing is used to create parse trees. These can be used for grammar checks in a text editor: ``A sentence that cannot be parsed may have grammatical errors''\cite[p.~461]{Jurafsky2009b}. But they more likely ``serve as an important intermediate stage of representation for semantic analysis''\cite[p.~461]{Jurafsky2009b}.
There are different algorithms available to create such trees. The CYK\footnote{named after inventors John Cocke, Daniel Younger and Tadeo Kasami} algorithm will be explained further. But before the CYK algorithm is explained, the reason for its existance is presented.
+	Syntactic Parsing is used to create parse trees. These can be used for grammar checks in a text editor: ``A sentence that cannot be parsed may have grammatical errors''\cite[p.~461]{Jurafsky2009b}. But they more likely ``serve as an important intermediate stage of representation for semantic analysis''\cite[p.~461]{Jurafsky2009b}. There are different algorithms available to create such trees. The CYK\footnote{named after inventors John Cocke, Daniel Younger and Tadao Kasami\cite[p.~893]{Russel2010}} algorithm will be explained further. But before the CYK algorithm is explained, the reason for its existence is presented.
+
+	\begin{figure}
+	\begin{alignat*}{2}
+	Noun &\rightarrow && \text{tree [1.00]} \\
+	Verb &\rightarrow && \text{is [1.00]} \\
+	Adjective &\rightarrow && \text{high [0.50]} \;|\; \text{very [0.50]} \\
+	Article &\rightarrow && \text{the [1.00]} \\
+	\end{alignat*}
+	\caption{The lexicon for $\varepsilon_{0}$. The sum of the probabilities for each category is 1.}
+	\label{fig:lexicon}
+	\end{figure}
+
+	\begin{figure}
+	\begin{alignat*}{3}
+	\varepsilon_{0}:& S \;&\rightarrow &\; NP\;\;VP \;&[1.00]&\; \text{The tree
+	is very high} \\
+	& NP \;&\rightarrow &\; A\;N \;&[1.00]&\; \text{The
+	tree}\\
+	& A \;&\rightarrow &\; Article\;&[1.00]&\; \text{the}\\
+	& N \;&\rightarrow &\; Noun\;&[1.00]&\; \text{tree}\\
+	& VP \;&\rightarrow &\; Verb \;&[0.40]&\; \text{is} \\
+	& \;&|&\; VP\;Adjs \;&[0.60]&\; \text{is
+	very high} \\
+	& Adjs \;&\rightarrow &\; Adjective \;&[0.80]&\; \text{very} \\
+	& \;&|&\; Adjs\;Adjs \;&[0.20]&\; \text{very
+	high}
+	\end{alignat*}
+
+	\caption{The grammar for $\varepsilon_{0}$ with example phrases for each rule. The syntactic categories are sentence (S), noun phrase (NP), verb phrase (VP), article (A), noun (N) and list of adjectives (Adjs). The categories article and noun have been added to allow a CNF grammar.}
+	\label{fig:grammar}
+	\end{figure}
+
+	\subsubsection*{Bottom-up and Top-down}
+	\label{subSubSec:bottomUpTopDown}

 	There are two classical ways of parsing a sentence. The one is bottom-up and the other one is top-down. Both approaches have their own advantages and disadvantages. In addition the ambiguity creates problems. To implement bottom-up and top-down search algorithms in the face of ambiguity, ``an agenda-based backtracking strategy''\cite[p.~468]{Jurafsky2009b} is used. The problem here is that every time the parser recognizes that the current parse tree is wrong, it has to backtrack and explore other parts of the sentence. This creates a huge amount of work duplication and is therefore inefficient.

+	\subsubsection*{CYK algorithm}
+	\label{subSubSec:cykAlgorithm}
+
 	A solution to these problems is offered by ``dynamic programming parsing methods''\cite[p.~469]{Jurafsky2009b}. The CYK algorithm is one of multiple algorithms based on dynamic programming. The CYK does only work with grammars in the Chomsky Normal Form (CNF). Every context-free grammar can be converted to CNF without loss in expressiveness. Therefore this restriction does no harm but simplifies the parsing. For information on how context-free grammars can be converted to CNF, refer to Jurafsky\cite{Jurafsky2009b}.
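+
+	To give an impression of such a conversion, here is a small example (the rule and the nonterminal $X_{1}$ are hypothetical and not part of $\varepsilon_{0}$). A rule with three symbols on its right-hand side is binarized by introducing a new nonterminal:
+	\[
+	VP \rightarrow Verb\;NP\;PP\;[p] \quad\Longrightarrow\quad VP \rightarrow Verb\;X_{1}\;[p],\;\; X_{1} \rightarrow NP\;PP\;[1.00]
+	\]
+	The probability of the original rule stays on the first new rule, so the probability of a complete parse does not change.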
-	CYK requires $\mathcal{O}(n^{2}m)$ space for the $P$ table (a table with probabilities), where ``$m$ is the number of nonterminal symbols in the grammar''\cite[p.~893]{Russel2010}, and uses $\mathcal{O}(n^{3}m)$ time. ``$m$ is constant for a particular grammar, [so it] is commonly described as $\mathcal{O}(n^{3})$''\cite[p.~893]{Russel2010}. There is no algorithm that is better than CYK for general context-free grammars\cite{Russel2010}.
+	CYK requires $\mathcal{O}(n^{2}m)$ space for the $P$ table (a table with probabilities), where ``$m$ is the number of nonterminal symbols in the grammar''\cite[p.~893]{Russel2010}, and uses $\mathcal{O}(n^{3}m)$ time. ``$m$ is constant for a particular grammar, [so it] is commonly described as $\mathcal{O}(n^{3})$''\cite[p.~893]{Russel2010}. But these bounds say little without a point of comparison. How good is $\mathcal{O}(n^{3})$ in comparison? To give a better idea of the relations, here is a small comparison with the ``Earley Algorithm''\cite[p.~477]{Jurafsky2009b}. The Earley algorithm performs better with all unambiguous grammars.\cite{Li} It has the same upper bound in time but in most cases it is quicker. Furthermore it has a space complexity of $\mathcal{O}(n)$, which is definitely better than CYK.\cite{Li} For ambiguous grammars, though, the Earley algorithm uses more space than CYK, and the actual space used depends on the length of the input.\cite{Li} In time complexity the CYK algorithm can only compete with Earley if ambiguous grammars are used.\cite{Li} But CYK is still of use for parsing natural language, because natural language grammars are always ambiguous. Therefore there is no algorithm that is better than CYK for general context-free grammars.\cite{Russel2010}

-	But how does CYK work? CYK doesn't examine all parse trees. It just examines the most probable one and computes the probability of that tree. All the other parse trees are present in the $P$ table and could be enumerated with a little work (in exponential time). But the strength and beauty of CYK is, that they don't have to be enumerated. CYK defines ``the complete state space defined by the `apply grammar rule' operator''\cite[p.~894]{Russel2010}. You can search just a part of this space with $A^{*}$ search.\cite{Russel2010} ``With the $A^{*}$ algorithm [...] the first parse found will be the most probable''\cite[p.~895]{Russel2010}.
+	But how does CYK work? CYK doesn't examine all parse trees. It just examines the most probable one and computes the probability of that tree. All the other parse trees are present in the $P$ table and could be enumerated with a little work (in exponential time). But the strength and beauty of CYK is that they don't have to be enumerated. CYK defines ``the complete state space defined by the `apply grammar rule' operator''\cite[p.~894]{Russel2010}. You can search just a part of this space with $A^{*}$ search.\cite{Russel2010} ``With the $A^{*}$ algorithm [...] the first parse found will be the most probable''\cite[p.~895]{Russel2010}. The actual pseudocode can be found in figure 23.5 in Russell\cite[p.~894]{Russel2010}.

-	But these probabilities need to be learned from somewhere. This somewhere is usually a ``treebank''\cite[p.~895]{Russel2010}, which contains a corpus of correctly parsed sentences. The best known is the Penn Treebank\cite{Russel2010}, which ``consists of 3 million words which have been annotated with part of speech and parse-tree structure, using human labor assisted by some automated tools''\cite[p.~895]{Russel2010}. The probabilities are then computed by counting and smoothing in the given data.\cite{Russel2010} There are other ways to learn the probabilities that are more difficult. For more information refer to Russel\cite{Russel2010}.
+
+	\subsubsection*{Treebank}
+	\label{subSubSec:treebank}
+	But these probabilities need to be learned from somewhere. This somewhere is usually a ``treebank''\cite[p.~895]{Russel2010}, which contains a corpus of correctly parsed sentences. The best known is the Penn Treebank\cite{Russel2010}, which ``consists of 3 million words which have been annotated with part of speech and parse-tree structure, using human labor assisted by some automated tools''\cite[p.~895]{Russel2010}. The probabilities are then computed by counting and smoothing in the given data.\cite{Russel2010} There are other, more difficult ways to learn the probabilities. For more information refer to Russell\cite{Russel2010}.
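+
+	To give an idea of what this counting looks like, here is the standard maximum-likelihood estimate for one rule of the grammar in figure \ref{fig:grammar} (the notation is ours, not Russell's): the probability of a rule is its relative frequency among all rules with the same left-hand side,
+	\[
+	P(VP \rightarrow VP\;Adjs) = \frac{Count(VP \rightarrow VP\;Adjs)}{\sum_{\beta}Count(VP \rightarrow \beta)}
+	\]
+	Smoothing then moves a small part of this probability mass to rules that never occur in the treebank, so that no rule ends up with probability 0.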
+
+	\subsubsection*{Application}
+	\label{subSubSec:application}
+
+	Now it is time to use the CYK algorithm with our example. For this case a restricted language called $\varepsilon_{0}$ is defined that is suitable to form one sentence about a tree. Next a lexicon (figure \ref{fig:lexicon}), ``or list of allowable words''\cite[p.~890]{Russel2010}, is defined. Furthermore a grammar (figure \ref{fig:grammar}) for $\varepsilon_{0}$ is defined. The lexicon and the grammar are based upon the lexicon and grammar in figures 23.1 and 23.2 of \cite{Russel2010} respectively.
+
+	The CYK algorithm is given the words and the grammar and returns the table $P$ containing the probabilities for the whole sentence and its subsequences.\cite{Russel2010} The pseudocode can be seen in algorithm \ref{alg:cyk}.
+
+	\begin{algorithm}
+	\caption{Application of CYK for our problem}
+	\label{alg:cyk}
+	\begin{algorithmic}[1]
+		\Procedure{CYK-Parse}{$words, grammar$}
+			\State $N \gets \Call{Length}{words}$\Comment{N = 5}
+			\State $M \gets$ the number of nonterminal symbols in $grammar$\Comment{M = 6}
+			\State $P \gets$ an array of size [M, N, N], initially all 0
+			\For{$i = 1$ to $N$}
+				\ForAll{rules of form ($X \rightarrow words_{i}$ [$p$])}
+					\State $P[X, i, 1] \gets p$
+				\EndFor
+			\EndFor
+			\For{$length = 2$ to $N$}
+				\For{$start = 1$ to $N - length + 1$}
+					\For{$len1 = 1$ to $length - 1$}
+						\State $len2 \gets length - len1$
+						\ForAll{rules of the form ($X \rightarrow$ $Y$ $Z$ [$p$])}
+							\State $P[X, start, length] \gets \Call{Max}{P[X, start, length], P[Y, start, len1] \times P[Z, start + len1, len2] \times p}$
+						\EndFor
+					\EndFor
+				\EndFor
+			\EndFor
+			\State \Return $P$
+		\EndProcedure
+	\end{algorithmic}
+	\end{algorithm}
+
+	The resulting $P$ table is depicted in table \ref{tab:p}. As you can see in the table, the whole sentence can be parsed as $S$ (the entry with start 1 and length 5). In linear form one such parse is [$S$ [$NP$ [$A$ the] [$N$ tree]][$VP$ [$VP$ is][$Adjs$ [$Adjs$ very][$Adjs$ high]]]]. With this information given, a parse tree can easily be constructed.
+
+	\begin{table}
+		\caption{Table of probabilities from the CYK parse. The entries with probability 0 have been left out.}
+		\label{tab:p}
+		\centering
+		\begin{tabular}{|c|c|c|c|}
+		\hline
+		X & start & length & p \\
+		\hline
+		\hline
+		A & 1 & 1 & 1.00 \\
+		\hline
+		N & 2 & 1 & 1.00 \\
+		\hline
+		VP & 3 & 1 & 0.40 \\
+		\hline
+		Adjs & 4 & 1 & 0.80 \\
+		\hline
+		Adjs & 5 & 1 & 0.80 \\
+		\hline
+		NP & 1 & 2 & 1.00 \\
+		\hline
+		VP & 3 & 2 & 0.192 \\
+		\hline
+		Adjs & 4 & 2 & 0.128 \\
+		\hline
+		S & 1 & 3 & 0.40 \\
+		\hline
+		VP & 3 & 3 & 0.09216 \\
+		\hline
+		S & 1 & 4 & 0.192 \\
+		\hline
+		S & 1 & 5 & 0.09216 \\
+		\hline
+		\end{tabular}
+	\end{table}
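+
+	To illustrate how the innermost line of algorithm \ref{alg:cyk} fills the table, here are two of the entries of table \ref{tab:p} computed by hand (the rule probabilities come from figure \ref{fig:grammar}):
+	\begin{alignat*}{1}
+	P[VP, 3, 2] &= P[VP, 3, 1] \times P[Adjs, 4, 1] \times 0.60 = 0.40 \times 0.80 \times 0.60 = 0.192 \\
+	P[S, 1, 5] &= P[NP, 1, 2] \times P[VP, 3, 3] \times 1.00 = 1.00 \times 0.09216 \times 1.00 = 0.09216
+	\end{alignat*}
+	The factors 0.60 and 1.00 are the probabilities of the rules $VP \rightarrow VP\;Adjs$ and $S \rightarrow NP\;\;VP$ respectively.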
\subsection{Semantic Analysis}
\label{subSec:semanticAnalysis}
@@ -222,9 +353,15 @@
 	Therefore context-free grammar rules are augmented with ``semantic attachments''\cite[p.~618]{Jurafsky2009}. Every word and syntactic structure in a sentence gets such a semantic attachment. The tree with syntactic components is now traversed in a bottom-up manner. On the way the semantic attachments are combined to finally produce ``First-Order Logic''\cite[p.~589]{Jurafsky2009a} that can be interpreted in a meaningful way. This procedure has some prerequisites that will be explained first.

+	\subsubsection*{First-Order Logic}
+	\label{subSubSec:firstOrderLogic}
+
 	The mentioned \textit{First-Order Logic} can be represented by a context-free grammar specification. It is beyond this paper to describe this specification completely. Jurafsky\cite{Jurafsky2009a} provides a detailed picture of the specification with all elements in figure 17.3. The most important aspects of this specification are explained here. The logic provides terms which can be functions, constants and variables. Functions have a term as argument. Syntactically they are the same as single-argument predicates. But functions represent one unique object. Predicates can have multiple terms as arguments. In addition the logic provides quantifiers ($\forall, \exists$) and connectives ($\wedge, \vee, \Rightarrow$).

+	\subsubsection*{Lambda notation}
+	\label{subSubSec:lambdaNotation}
+
 	Another prerequisite is the ``lambda notation''\cite[p.~593]{Jurafsky2009a}. A simple example of this notation is an expression of the following form\footnote{examples taken from Jurafsky\cite[pp.~593-594]{Jurafsky2009a}}:
 	\[
 	\lambda x.P(x)
 	\]
@@ -237,7 +374,7 @@
 	&P(A)&
 	\end{alignat*}

-	Those expressions can be extended to $n$ such $\lambda$s. An example is this expression:
+	Those expressions can be extended to $n$ such $\lambda$s. An example is this expression (where $x$ and $y$ denote entities between which a distance can be measured):
 	\[
 	\lambda x.\lambda y.Near(x,y)
 	\]
@@ -251,61 +388,122 @@
 	This technique is called ``currying''\cite[p.~594]{Jurafsky2009a} and is used to convert ``a predicate with multiple arguments into a sequence of single-argument predicates''\cite[p.~594]{Jurafsky2009a}.

-	After the prerequisites are now explained, it is time to start with the actual syntax-driven semantic analysis. It will be shown with an example provided by Jurafsky. Assume the sentence \textit{Every restaurant closed}. ``The target representation for this example should be the following''\cite[p.~621]{Jurafsky2009}.
+	\subsubsection*{Syntax-driven semantic analysis}
+	\label{subSubSec:syntaxDrivenSemanticAnalysis}

-	\begin{equation}
-	\label{eq:tarRep}
-	\forall x \,Restaurant(x) \Rightarrow \exists e \,Closed(e) \wedge ClosedThing(e,x)
-	\end{equation}
+	Now that the prerequisites have been explained, it is time to start with the actual syntax-driven semantic analysis.
-	After the prerequisites are now explained, it is time to start with the actual syntax-driven semantic analysis. It will be shown with an example provided by Jurafsky. Assume the sentence \textit{Every restaurant closed}. ``The target representation for this example should be the following''\cite[p.~621]{Jurafsky2009}.
+	\subsubsection*{Syntax-driven semantic analysis}
+	\label{subSubSec:syntaxDrivenSemanticAnalysis}

-	\begin{equation}
-	\label{eq:tarRep}
-	\forall x \,Restaurant(x) \Rightarrow \exists e \,Closed(e) \wedge ClosedThing(e,x)
-	\end{equation}
+	Now that the prerequisites have been explained, it is time to start with the actual syntax-driven semantic analysis. It is shown with the previously introduced example and picks up where the syntactic parsing left off.

-	The first step is to determine what the meaning representation of \textit{Every restaurant} should be. \textit{Every} is responsible for the $\forall$ quantifier and \textit{restaurant} specifies the category over which is quantified. This is called the ``restriction''\cite[p.~622]{Jurafsky2009} of the noun phrase. The meaning representation could be $\forall x\,Restaurant(x)$. It is a valid logical formula but it doesn't make much sense. ``It says that everything is a restaurant.''\cite[p.~622]{Jurafsky2009} ``Noun phrases like [this] are [usually] embedded in expressions that [say] something about the universally quantified variable. That is, we're probably trying to \textit{say something} about all restaurants. This notion is traditionally referred to as the \textit{NP}'s nuclear scope''\cite[p.~622]{Jurafsky2009}. In the given example, the nuclear scope is \textit{closed}. To represent this notion in the target representation, a dummy predicate $Q$ is added, which results in this expression:
-	\[
-	\forall x\,Restaurant(x) \Rightarrow Q(x)
-	\]
-	To replace $Q$ with something meaningful, the $\lambda$ notation is needed.
-	\[
-	\lambda Q.\forall x\,Restaurant(x) \Rightarrow Q(x)
-	\]
-	After more generalization, this is the result:
-	\[
-	\lambda P.\lambda Q.\forall x\,P(x) \Rightarrow Q(x)
-	\]
-	What happened? The descriptor \textit{every} gets this last expression as semantic attachment. The noun \textit{restaurant} gets $\lambda x.Restaurant(x)$. When combined, the second expression is the result. The verb is still missing. Therefore the verb \textit{closed} gets the following expression.
-	\[
-	\lambda x.\exists e\,Closed(e) \wedge ClosedThing(e,x)
-	\]
-	After combining the formulas of the verb and the noun phrase, the previously shown target representation\eqref{eq:tarRep} is the result.
+	The grammar rules have to be augmented with the semantic attachments. This process goes through all involved rules in a bottom-up way. To remind you of the sentence, here it is again: ``The tree is very high''. The first rule is the $A$ rule, which produces ``The''. The article implies that there is exactly one entity, which is therefore easily identified. If there were multiple entities of the same type, ``the'' would not be enough to specify which entity is meant.
+
+	\[
+	A \rightarrow the \;\{\lambda x.\lambda P.\exists x.P(x)\}
+	\]
+
+	The next rule is the one responsible for ``tree''.
+
+	\[
+	N \rightarrow tree \;\{\lambda x.Tree(x)\}
+	\]
+
+	NP is the combination of the two previous rules and therefore has to combine their meanings somehow. This is done by using the N semantic attachment as the argument for the A attachment.
+
+	\[
+	NP \rightarrow A\;N \;\{A.sem(N.sem)\}
+	\]
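+	To see mechanically what $A.sem(N.sem)$ does before moving on to the remaining rules, here is a tiny Python sketch in which the two attachments are modelled as closures that build formula strings. It is deliberately simplified: the outer $\lambda x$ of the A attachment is dropped, and the names are illustrative only.
+
+	\begin{verbatim}
+	# A attachment, simplified to: lambda P. exists x. P(x)
+	a_sem = lambda P: "exists x." + P("x")
+
+	# N attachment: lambda x. Tree(x)
+	n_sem = lambda x: "Tree(" + x + ")"
+
+	# Rule NP -> A N with attachment A.sem(N.sem)
+	np_sem = a_sem(n_sem)
+	print(np_sem)  # exists x.Tree(x)
+	\end{verbatim}
+
+	The application corresponds to a single $\lambda$-reduction: the predicate $\lambda x.Tree(x)$ is substituted for $P$.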
+	Next are the Adjs rules. The first is the one that handles ``very''.
+
+	\[
+	Adjs \rightarrow very \;\{\lambda x.Very(x)\}
+	\]
+
+	It states that the meaning of an adjective is intensified by ``very''. The rule for ``high'' gets augmented with an attachment that describes a thing that is high.
+
+	\[
+	Adjs \rightarrow high \;\{\lambda x.HighThing(a, x)\}
+	\]
+
+	Here $x$ stands for the entity that is high. Another Adjs rule brings these two together. To differentiate the semantic attachments of the previous two Adjs rules, the affected adjective is noted in square brackets.
+
+	\[
+	Adjs \rightarrow Adjs\;Adjs \;\{\lambda x.\exists a.Adjs[very].sem(a) \wedge Adjs[high].sem(x) \}
+	\]
+
+	The VP rules are next to be augmented. First comes the one that is responsible for ``is''. The word ``is'' implies that there is an entity with a state.
+
+	\[
+	VP \rightarrow is \;\{\lambda P.\lambda Q.Q(x) \Rightarrow P(x)\}
+	\]
+
+	In the next step the VP rule for ``is'' and the adjectives are combined. The same applies here as in the Adjs case: as there are two VP rules, the ``is'' rule is identified by the verb in square brackets. As there are no square brackets after Adjs in the following attachment, the Adjs rule that combines the adjective rules is meant.
+
+	\[
+	VP \rightarrow VP\;Adjs \;\{VP[is].sem(Adjs.sem)\}
+	\]
+
+	Last comes the S rule that combines NP and VP. Here VP refers to the rule that combines the ``is'' rule and the Adjs rules.
+
+	\[
+	S \rightarrow NP\;\;VP \;\{VP.sem(NP.sem)\}
+	\]
+
+	With the semantic attachments in place, the final meaning representation can easily be retrieved. The replacement happens from top to bottom: it starts with the semantic attachment of S and works down to the semantic attachments of the rules that produce the actual words. The critical part is the $\lambda$-reduction in the first VP rule. The intermediate steps are shown below.
+
+	\begin{alignat*}{1}
+	\lambda P.\lambda Q.Q(x) \Rightarrow P(x)(\lambda x.\exists a.Adjs[very].sem(a) \wedge Adjs[high].sem(x)) \\
+	\lambda Q.Q(x) \Rightarrow \lambda x.\exists a.Adjs[very].sem(a) \wedge Adjs[high].sem(x)(x) \\
+	\lambda Q.Q(x) \Rightarrow \exists a.Adjs[very].sem(a) \wedge Adjs[high].sem(x) \\
+	\lambda Q.Q(x) \Rightarrow \exists a.\lambda x.Very(x)(a) \wedge Adjs[high].sem(x) \\
+	\lambda Q.Q(x) \Rightarrow \exists a.Very(a) \wedge \lambda x.HighThing(a, x)(x) \\
+	\lambda Q.Q(x) \Rightarrow \exists a.Very(a) \wedge HighThing(a, x) \\
+	\intertext{inserting the NP attachments}
+	\lambda Q.Q(x) \Rightarrow \exists a.Very(a) \wedge HighThing(a, x)(\lambda x.\lambda P.\exists x.P(x)) \\
+	\lambda x.\lambda P.\exists x.P(x)(x) \Rightarrow \exists a.Very(a) \wedge HighThing(a, x)(\lambda x.Tree(x)) \\
+	\exists x.\lambda x.Tree(x)(x) \Rightarrow \exists a.Very(a) \wedge HighThing(a, x) \\
+	\exists x.Tree(x) \Rightarrow \exists a.Very(a) \wedge HighThing(a, x)
+	\end{alignat*}
+
+	The final meaning representation is therefore the following.
+
+	\[
+	\exists x.Tree(x) \Rightarrow \exists a.Very(a) \wedge HighThing(a, x)
+	\]
+
+	If you translate this logic into words, it reads something like this: ``There is a tree that is a high thing, a very high thing''. This complete run-through from syntactic parsing up to the meaning representation has hopefully shown the two presented methods in action and helped you understand them better. Furthermore this overarching example should have prepared you to follow the critical discussion that comes next in the paper.

-	This example is just one of many, but it shows how semantic meaning can be attached to syntactic components. Furthermore it should be clear now, how semantic analysis in a syntax-driven approach works.

 	\section{Critical discussion}
 	\label{sec:critDiscussion}

+	%TODO back up every claim (reference after first sentence)
+
+	Now that both methods have been presented with one selected approach each, it is time to discuss them critically. The CYK algorithm solves many problems, such as ambiguity, at least to a certain degree. But it is also problematic because of its restriction to CNF.
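+	As a reminder, CNF only allows rules of the form $X \rightarrow Y\;Z$ or $X \rightarrow word$. A longer rule therefore has to be binarized with helper nonterminals; the rule below and the helper $X_{1}$ are hypothetical, not taken from $\varepsilon_{0}$:
+
+	\[
+	X \rightarrow Y\;Z\;W \quad\text{becomes}\quad X \rightarrow Y\;X_{1} \text{ and } X_{1} \rightarrow Z\;W
+	\]
+
+	Unit rules such as $X \rightarrow Y$ have to be eliminated as well, so the resulting parse trees no longer mirror the original grammar directly.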
 	While in theory every context-free grammar can be converted to CNF, in practice it poses ``some non-trivial problems''\cite[p.~475]{Jurafsky2009b}. One of these problems can be explored in conjunction with the second presented method (semantic analysis). ``[T]he conversion to CNF will complicate any syntax-driven approach to semantic analysis''\cite[p.~475]{Jurafsky2009b}. A solution to this problem is some kind of post-processing in which the trees are converted back to the original grammar.\cite{Jurafsky2009b} Another option is to use a more complex dynamic programming algorithm that accepts any kind of context-free grammar. Such an algorithm is the ``Earley Algorithm''\cite[p.~477]{Jurafsky2009b}.

-	The syntax-driven semantic analysis, as it has been presented, is a powerful method that is easy to understand. But it has one essential problem. It relies upon an existing set of grammar rules with semantic attachments to them. In a real world example such a table would contain thousands of grammar rules. While it is relatively easy to compute the final meaning representation with such a given table, it is very hard to create the table in the first place. The difficulty to create this table is split into two main issues. The first one being that you must find a grammar specification that fits all your use cases. This problem applies for the syntactic parsing as well. The second issue is that one has to find out the semantic attachments to the grammar rules.
+	The syntax-driven semantic analysis, as it has been presented, is a powerful method that is easy to understand. But it has one essential problem: it relies upon an existing set of grammar rules with semantic attachments. In a real-world example such a table would contain thousands of grammar rules.\cite{Russel2010} While it is relatively easy to compute the final meaning representation with such a given table, it is very hard to create the table in the first place. The difficulty of creating this table splits into two main issues. The first is that you must find a grammar specification that fits all your use cases; this problem applies to the syntactic parsing as well. The second is that one has to work out the semantic attachments to the grammar rules.

-	This initial workload to create a state, in which the semantic analysis works, is a unique effort. In a restricted environment with a limited set of words and topics, this workload is of low importance. Even if it takes one month to create such a table by hand or by computing it, the subsequent analysis of input based on this table is rather quick and the initial workload is therefore acceptable. But this is only true for restricted environments. If someone tried to use syntax-driven semantic analysis for the complete language of modern English, the creation of such a table would outweigh any possible usage.
+	This initial workload to create a state in which the semantic analysis works is a one-time effort.\cite{Jurafsky2009} A restricted environment has a limited set of words and topics compared to an unrestricted environment; an example is a flight check-in automaton that only needs to process a subset of the full English grammar. In such an environment this workload is of low importance. Even if it takes one month to create such a table by hand or by computing it, the subsequent analysis of input based on this table is rather quick, and the initial workload is therefore acceptable. But this is only true for restricted environments.
+	If someone tried to use syntax-driven semantic analysis for the complete language of modern English, the creation of such a table would outweigh any possible usage.
+
+	%TODO three options: add reference to claim, introduce necessary knowledge prior to this point or drop it

 	Comparing the complexity of the two methods reveals a mirror-like image. For the parsing, the creation of the grammar is comparatively easy. The presented CYK algorithm works with context-free grammars, which are a very restricted set compared to natural languages. But even within these context-free grammars there are ambiguities inside the texts themselves. The creation of the parse trees is therefore more of a problem.

-	Syntax-driven semantic analysis on the other hand requires a decent amount of work to add semantic attachments to grammar rules. But once this has been done, it works very fast.
+	Syntax-driven semantic analysis on the other hand requires a decent amount of work to add semantic attachments to grammar rules.\cite{Jurafsky2009} But once this has been done, it works very fast.

-	Both methods require a unique work for one specific usage. This unique workload is the grammar creation for the parsing and the extension of the grammar with semantic attachments for the semantic analysis. The less restricted the usage environment, the more complex the initial workload becomes. The same is true for the recurring workload inside one specific usage.
+	Both methods require an initial workload for every usage domain: the grammar creation for the parsing, and the extension of the grammar with semantic attachments for the semantic analysis. The less restricted the usage environment, the more complex this initial workload becomes. The same is true for the recurring workload of every actual usage inside one usage domain.

-	Judging by the state-of-the-art of computer technology, parsing does still pose a significant challenge once the restricted field of programming languages is left. The semantic analysis as the second method in the chain has therefore even more problems to date. As the presented syntax-driven approach does only work with parse trees, a semantic analysis can only be undertaken once the syntactic parsing succeeds.
+	Judging by the state of the art of computer technology, parsing still poses a significant challenge once the restricted field of programming languages is left. The semantic analysis as the second method in the chain has therefore even more problems to date. As the presented syntax-driven approach only works with syntactic representations\cite{Jurafsky2009}, a semantic analysis can only be undertaken once the syntactic parsing succeeds.

 	Ambiguity remains one of the biggest issues for both methods. The syntax-driven semantic analysis in particular only considers the semantic meaning alone. That is not its fault, as the analysis does not know the context; the presented approach looks at each sentence in a sandbox. The generated meaning representations are therefore only of limited use for a less restricted grammar.

 	\section{Conclusion}
 	\label{sec:concl}

-	Syntactic parsing is an important method on the way to understand natural language. The usage of dynamic programming algorithms circumvents many of the issues that classical top-down or bottom-up parsing algorithms face. Ambiguity is the most prominent of those issues. The best algorithm for context-free grammars is the CYK algorithm, which is a dynamic programming algorithm.
But in practice it is very restricted, because it only works with grammars in CNF. But there are more complex dynamic programming algorithms that allow any kind of context-free grammar.
+	Syntactic parsing is an important method on the way to understanding natural language. The usage of dynamic programming algorithms circumvents many of the issues that classical top-down or bottom-up parsing algorithms face. Ambiguity is the most prominent of those issues. One of the best-known algorithms for context-free grammars is the CYK algorithm, which is a dynamic programming algorithm. But in practice it is rather restricted, because it only works with grammars in CNF. There are, however, more complex dynamic programming algorithms that allow any kind of context-free grammar. Such an algorithm is the ``Earley Algorithm''\cite[p.~477]{Jurafsky2009b}, which was already introduced in the critical discussion.

-	Semantic analysis is the second method in the chain to understand natural language and therefore important as well. There are different approaches to the analysis. One of them is the syntax-driven approach that depends on parse trees. This dependency creates a delay effect: As long as a certain peace of text cannot be parsed, it definitely can't be analyzed for it's semantic meaning either. This is not an issue for restricted environments like programming languages or a very restricted subset of a natural language's grammar. But it is a major issue for real natural language, because there already the parsing does pose significant challenges.
+	Semantic analysis is, as presented here, the second method in the chain to understand natural language and therefore important as well. There are different approaches to the analysis. One of them is the syntax-driven approach, which depends on parse trees. This dependency creates a delay effect: as long as a certain piece of text cannot be parsed, it cannot be analyzed for its semantic meaning either. This is not an issue for restricted environments like programming languages or a very restricted subset of a natural language's grammar. But it is a major issue for real natural language, because there the parsing alone already poses significant challenges.

 	Looking into the future, both methods require substantial improvements on the algorithm side to reach a point where understanding non-restricted natural languages becomes possible. As it is right now, it is not possible to create dialog systems that interact fully naturally with humans. To make any kind of language interaction possible, the set of possible words and sentence structures must be restricted. But even if that is given (as in a flight check-in automaton), the computer has only a finite set of possible cases. The programmer can add tons of if-clauses or comparable statements to check for different cases, but in the end it is all finite, so that many of the user inputs must lead to the same output or to no output at all. This fact has led to the current situation in which most interaction with a computer happens via a restricted interface in which the user can only choose from a limited set of options (by clicking on a button, selecting an item of a list, etc.).

-	In addition the ambiguity of natural language is a major issue. Going back to the example in the introduction, the syntax-driven semantic analysis does only work properly if the semantic meaning of the input has no ambiguity. But even than the generated meaning representation does not represent the pragmatic meaning.
A dialog system is therefore far from being reached, because every input of a human can have dozens of different meanings. The intended meaning can sometimes depend on a thought that this human had while typing the input. As the computer doesn't have the ability to read thoughts, it would be impossible for the computer to determine the intended meaning of the input.
+	Furthermore the ambiguity of natural language is a major issue. The solution to it could lie in understanding the context. Even though natural language is full of ambiguity, we manage to communicate very successfully. Therefore the solution to ambiguity probably lies somewhere in our brain's functionality. Cognitively inspired methods that do not use traditional AI and First-Order Logic, but instead are inspired by our brain and try to understand and model natural language based on the context, might well be the solution to ambiguity altogether. The method presented by Gnjatović\cite{Gnjatovic2012} could be such an approach. In a mission-critical environment this ambiguity could lead to catastrophic results, because the computer, simply put, ``didn't get it''. This risk will probably limit the usability of natural language communication with a computer to a very restricted set of use cases for a long time.