ProSem: Outline, Bib-Datei und Ausgangsversion Paper hinzugefügt.

2026-05-06 11:26:25 +02:00 · 2013-11-13 17:50:20 +01:00
parent 1fe6f198e6
commit 2e78256790
3 changed files with 668 additions and 0 deletions
--- a/prosem/prosem-ki.bib
+++ b/prosem/prosem-ki.bib
@ -0,0 +1,339 @@
+% This file was created with JabRef 2.9b2.
+% Encoding: Cp1252
+
+% Brin and Page (1998): conference paper on a large-scale hypertextual web search engine.
+@inproceedings{Brin1998,
+  author    = {Brin, Sergey and Page, Lawrence},
+  title     = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
+  booktitle = {Seventh World Wide Web Conference},
+  year      = {1998},
+  keywords  = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google},
+  owner     = {jim},
+  quality   = {1},
+  timestamp = {2013.10.29}
+}
+
+% Clark and Curran (2004), ACL paper on parsing the WSJ with CCG.
+% Uses the canonical inproceedings type (conference is only an alias) and an
+% en-dash page range.
+@inproceedings{Clark2004,
+  author    = {Clark, Stephen and Curran, James R.},
+  title     = {Parsing the {WSJ} using {CCG} and Log-Linear Models},
+  booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics},
+  year      = {2004},
+  pages     = {104--111},
+  owner     = {jim},
+  quality   = {1},
+  timestamp = {2013.10.29}
+}
+
+% Kessler, Nunberg and Schuetze (1997), ACL paper on automatic genre detection.
+% The third author's surname is stored with a BibTeX special character for the
+% umlaut so it sorts and prints correctly; page range uses an en-dash.
+@inproceedings{Kessler1997,
+  author    = {Kessler, Brett and Nunberg, Geoffrey and Sch{\"u}tze, Hinrich},
+  title     = {Automatic Detection of Text Genre},
+  booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational Linguistics},
+  year      = {1997},
+  pages     = {32--38},
+  owner     = {jim},
+  quality   = {1},
+  timestamp = {2013.10.29}
+}
+
+% Klein, Smarr, Nguyen and Manning (2003), CoNLL paper on character-level NER.
+% Venue name corrected (CoNLL = Conference on Natural Language Learning) and the
+% acronym is brace-protected against style recasing.
+@inproceedings{Klein2003,
+  author    = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher D.},
+  title     = {Named Entity Recognition with Character-Level Models},
+  booktitle = {Conference on Natural Language Learning ({CoNLL})},
+  year      = {2003},
+  pages     = {180--183},
+  owner     = {jim},
+  quality   = {1},
+  timestamp = {2013.10.29}
+}
+
+% Paskin (2001), technical report UCB/CSD-01-1148 on grammatical bigram models.
+% The abstract field holds the owner's reading notes, not the report's own abstract.
+% month uses the predefined macro so styles can localise or abbreviate it.
+@techreport{Paskin2001,
+  author      = {Paskin, Mark A.},
+  title       = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram Models},
+  institution = {University of California, Berkeley},
+  year        = {2001},
+  number      = {UCB/CSD-01-1148},
+  month       = jun,
+  abstract    = {In Dependency Grammar there are head words and dependents. Each phrase
+	has only one head word. The head word determines how all of its dependents
+	may be syntactically combined with other words to form a sentence.
+	A head word and all of its dependents form a constituent. In every
+	sentence there may be one or more dependency relationships with one
+	head word each.
+	
+	Dependents that precede their head are called predependents and dependents
+	that follow their head are called postdependents.
+	
+	
+	A dependency parse consists of a set of dependency relationships that
+	satisfies three constraints: 1. Every word except one (the root)
+	is dependent to exactly one head. 2. The dependency relationships
+	are acyclic; no word is, through a sequence of dependency relationships,
+	dependent to itself. 3. When drawn as a graph above the sentence,
+	no two dependency relations cross - a property known as projectivity
+	or planarity.
+	
+	
+	The Grammatical Bigram Probability Model assumes that all the dependents
+	of a head word are independent of one another and their relative
+	order. This is a strong approximation as in full English there are
+	argument structure constraints that rely on the order of dependents.
+	This simplification allows for a reduced computational complexity
+	for parsing and learning. The grammar model falls into the class
+	of "Bilexical grammars".
+	
+	
+	A dependency parse consists of multiple spans. A span has at least
+	two words up to n words. Spans have one property: No word in the
+	span has a parent outside the span. Spans can be joined and closed.
+	To join the span one of them has to be connected (both end words
+	are connected with an edge) and both spans have to share one endword.
+	The new span will be connected if both subspans were connected. If
+	that is not the case, it can be closed by adding an edge between
+	the endwords of the new span.
+	
+	
+	Every dependency parse has a unique span decomposition. For joining
+	the left subspan has to be simple. That means it has to have an edge
+	between its endwords or consist of two words only. Relying on this
+	ensures that each span is derived only once.
+	
+	
+	Every span has a signature. This signature states the indexes of its
+	endwords, if it is simple and whether the left or right endword have
+	parents within the span. Spans where both the left and right endword
+	have the parent within the string are called toplevel signatures
+	as such signatures characterize valid parses.
+	
+	
+	Parser operations take signatures as input rather than spans. They
+	produce signatures as well. SEED creates an unconnected and simple
+	span with two adjacent words. CLOSE-LEFT adds an edge between the
+	endwords and makes the left endword the parent of the right one.
+	CLOSE-RIGHT does the opposite and makes the right endword the parent
+	of the left one. These operators require that neither the left nor
+	the right endword have a parent within the span.
+	
+	
+	JOIN takes two input spans and joins them. It requires that the spans
+	share an endword (1.), the shared endword has one parent (2.) and
+	the left input is simple (3.). The JOIN rule applies only if the
+	left span doesn't start the sentence.
+	
+	
+	These operators constitute an algebra over span signatures called
+	span signature algebra. A derivation D is an expression in this algebra.
+	Like operations it evaluates to span signatures. These expressions
+	can be represented as trees where the nodes are operations. There
+	is an isomorphism between dependency parses and their corresponding
+	derivations.
+	
+	
+	Optimal derivation must consist of an operation over the results of
+	optimal sub-derivations. Therefore it is enough to record the parse
+	operation with the most likely derivation of a given signature in
+	order to reconstruct the most likely derivation of the entire sentence.
+	
+	
+	The chart-parse algorithm returns the optimal parse. It uses a subprocedure
+	called EXTRACT-OPT-PARSE that constructs the optimal parse by finding
+	the top-level signature (sigma) with maximum optimal probability
+	(pi*). It backtracks then recursively through the optimal derivation
+	defined by (omega*). If CLOSE operations are encountered edges are
+	recorded in the parse. The algorithm requires $O(n^3)$ time and $O(n^2)$
+	space.},
+  owner       = {jim},
+  quality     = {1},
+  timestamp   = {2013.10.29}
+}
+
+% Chapter 23 (pp. 888--927) of Russell and Norvig, third edition.
+% The citation key keeps its original spelling so existing citations still resolve;
+% only the author data is corrected to "Russell".
+% The abstract field holds the owner's reading notes on the chapter.
+% year is kept for classic BibTeX styles; date is ISO 8601 for biblatex.
+@inbook{Russel2010,
+  author     = {Russell, Stuart J. and Norvig, Peter},
+  title      = {Artificial Intelligence: A Modern Approach},
+  booktitle  = {Artificial Intelligence: A Modern Approach},
+  year       = {2009},
+  date       = {2009-12-11},
+  bookauthor = {Russell, Stuart J. and Norvig, Peter},
+  edition    = {Third},
+  series     = {Prentice-Hall series in artificial intelligence},
+  publisher  = {Prentice Hall},
+  chapter    = {23},
+  pages      = {888--927},
+  abstract   = {The first method to understanding natural language is syntactic analysis
+	or parsing. The goal is to find the phrase structure of a sequence
+	of words according to the rules of the applied grammar.
+	
+	A strict top-to-bottom or bottom-to-top parsing can be inefficient.
+	Given two sentences with the same first 10 words and a difference
+	only from the 11th word on, parsing from left-to-right would force
+	the parser to make a guess about the nature of the sentence. But
+	it doesn't know if it's right until the 11th word. From there it
+	would have to backtrack and reanalyze the sentence.
+	
+	
+	To prevent that, dynamic programming is used. Every analyzed substring
+	gets stored for later. Once it is discovered that for example "the
+	students in section 2 of Computer Science 101" is a noun phrase,
+	this information can be stored in a structure known as chart. Algorithms
+	that do such storing are called chart parsers. One of these chart
+	parsers is a bottom-up version called CYK algorithm after its inventors
+	John Cocke, Daniel Younger and Tadao Kasami. This algorithm requires
+	a grammar in the Chomsky Normal Form. The algorithm takes $O(n^2 m)$
+	space for the P table with n being the number of words in the sentence
+	and m the number of nonterminal symbols in the grammar. It takes
+	$O(n^3 m)$ time whereas m is constant for a particular grammar. That's
+	why it is commonly described as $O(n^3)$. There is no faster algorithm
+	for general context-free grammars.
+	
+	
+	The CYK algorithm only computes the probability of the most probable
+	tree. The subtrees are all represented in P table.
+	
+	
+	PCFGs (Probabilistic context free grammars) have many rules with a
+	probability for each one of them. Learning the grammar from data
+	is better than a knowledge engineering approach. Learning is easiest
+	if we are given a corpus of correctly parsed sentences; commonly
+	known as a treebank. The best known treebank is the Penn Treebank
+	as it consists of 3 million words which have been annotated with
+	part of speech and parse-tree structure. Given an amount of trees,
+	a PCFG can be created just by counting and smoothing.
+	
+	
+	If no treebank is given it is still possible to learn the grammar
+	but it is more difficult. In such a case there are actually two problems:
+	First learning the structure of the grammar rules and second learning
+	the probabilities associated with them.
+	
+	
+	PCFGs have the problem that they are context-free. Combining a PCFG
+	and Markov model will get the best of both. This leads ultimately
+	to lexicalized PCFGs. But another problem of PCFGs is their preference
+	for short sentences.
+	
+	
+	Lexicalized PCFGs introduce so called head words. Such words are the
+	most important words in a phrase and the probabilities are calculated
+	between the head words. Example: "eat a banana" "eat" is the head
+	of the verb phrase "eat a banana", whereas "banana" is the head of
+	the noun phrase "a banana". Probability P1 now depends on "eat" and
+	"banana" and the result would be very high. If the head of the noun
+	phrase were "bandanna", the result would be significantly lower.
+	
+	
+	The next step are definite clause grammars. They can be used to parse
+	in a way of logical inference and make it possible to reason about
+	languages and strings in many different ways. Furthermore augmentations
+	allow for distinctions in a single subphrase. For example the noun
+	phrase (NP) depends on the subject case and the person and number
+	of persons. A real world example would be "to smell". It is "I smell",
+	"you smell", "we smell", "you smell" and "they smell" but "he/she/it
+	smells". It depends on the person what version is taken.
+	
+	
+	Semantic interpretation is used to give sentences a meaning. This
+	is achieved through logical sentences. The semantics can be added
+	to an already augmented grammar (created during the previous step),
+	resulting in multiple augmentations at the same time. Chill is an
+	inductive logic programming program that can learn to achieve 70\%
+	to 85\% accuracy on various database query tasks.
+	
+	
+	But there are several complications as English is endlessly complex.
+	First there is the time at which things happened (present, past,
+	future). Second you have the so called speech act which is the speaker's
+	action that has to be deciphered by the hearer. The hearer has to
+	find out what type of action it is (a statement, a question, an order,
+	a warning, a promise and so on). Then there are so called long-distance
+	dependencies and ambiguity. The ambiguity can reach from lexical
+	ambiguity where a word has multiple usages, over syntactic ambiguity
+	where a sentence has multiple parses up to semantic ambiguity where
+	the meaning of one and the same sentence can be different. Last there
+	is ambiguity between literal meaning and figurative meanings.
+	
+	
+	Finally there are four models that need to be combined to do disambiguation
+	properly: the world model, the mental model, the language model and
+	the acoustic model.
+	
+	
+	-- not so much an abstract of the specific content of that section
+	as an abstract about speech recognition in general --
+	
+	
+	The second method is speech recognition. It has the added difficulty
+	that the words are not clearly separated and every speaker can pronounce
+	the same sentence with the same meaning differently. An example is
+	"The train is approaching". Another written form would be "The train's
+	approaching". Both convey the same meaning in the written language.
+	But if a BBC, a CNN and a German news anchor speaks this sentence
+	it will sound dramatically different. Speech recognition has to deal
+	with that problem to get the written text associated with the spoken
+	words. From the text the first method can then be used to analyze
+	the words and find a meaning. Finally this meaning can be used to
+	create some kind of action in a dialog system.
+	
+	
+	--
+	
+	
+	Some problems of speech recognition are segmentation, coarticulation
+	and homophones. Two used models are the acoustic model and the language
+	model. Another major model is the noisy channel model, named after
+	Claude Shannon (1948). He showed that the original message can always
+	be recovered in a noisy channel if the original message is encoded
+	in a redundant enough way.
+	
+	
+	The acoustic model in particular is used to get to the really interesting
+	parts. It is not interesting how words were spoken but more what
+	words were spoken. That means that not all available information
+	needs to be stored and a relatively low sample rate is enough. 80 samples
+	at 8kHz with a frame length of about 10 milliseconds is enough for
+	that matter. To distinguish words so called phones are used. There
+	are 49 phones used in English. A phoneme is the smallest unit of
+	sound that has a distinct meaning to speakers of a particular language.
+	Back to the frames: every frame is summarized by a vector of features.
+	Features are important aspects of a speech signal. It can be compared
+	to listening to an orchestra and saying "here the French horns are
+	playing loudly and the violins are playing softly". Yet another difficulty
+	are dialect variations.
+	
+	
+	The language model should be learned from a corpus of transcripts
+	of spoken language. But such a thing is more difficult than building
+	an n-gram model of text, because it requires a hidden Markov model.
+	
+	
+	All in all speech recognition is most effective when used for a specific
+	task against a restricted set of options. A general purpose system
+	can only work accurately if it creates one model for every speaker.
+	Prominent examples like Apple's Siri are therefore not very accurate.},
+  owner      = {jim},
+  timestamp  = {2013.10.24}
+}
+
+% Sleator and Temperley (1993), workshop paper introducing link grammar parsing.
+% Venue name corrected to the International Workshop on Parsing Technologies;
+% "English" is brace-protected so sentence-casing styles keep the capital.
+@inproceedings{Sleator1993,
+  author    = {Sleator, Daniel D. K. and Temperley, Davy},
+  title     = {Parsing {English} with a Link Grammar},
+  booktitle = {Third International Workshop on Parsing Technologies},
+  year      = {1993},
+  owner     = {jim},
+  quality   = {1},
+  timestamp = {2013.10.29}
+}
+
+% Smith and Eisner (2008), EMNLP paper on dependency parsing via belief propagation.
+% The conference dates (October 25 to 27) are stored in eventdate as an ISO 8601
+% range; the former free-text date value could not be parsed by biblatex.
+@inproceedings{Smith2008,
+  author    = {Smith, David A. and Eisner, Jason},
+  title     = {Dependency Parsing by Belief Propagation},
+  booktitle = {Conference on Empirical Methods in Natural Language Processing},
+  year      = {2008},
+  eventdate = {2008-10-25/2008-10-27},
+  pages     = {145--156},
+  owner     = {jim},
+  quality   = {1},
+  timestamp = {2013.10.29}
+}
+