% This file was created with JabRef 2.9b2.
% Encoding: Cp1252

@INPROCEEDINGS{Brin1998,
author = {Brin, Sergey and Page, Lawrence},
title = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
booktitle = {Seventh World Wide Web Conference},
year = {1998},
keywords = {World Wide Web, Search Engines, Information Retrieval, PageRank, Google},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}

@CONFERENCE{Clark2004,
author = {Clark, Stephen and Curran, James R.},
title = {Parsing the {WSJ} using {CCG} and Log-Linear Models},
booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics},
year = {2004},
pages = {104--111},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}

@INBOOK{Jurafsky2009,
chapter = {18},
pages = {617--644},
title = {Speech and Language Processing},
publisher = {Pearson},
year = {2009},
author = {Jurafsky, Daniel and Martin, James H.},
series = {Prentice-Hall series in artificial intelligence},
edition = {Second},
abstract = {Sentences get their meanings from the words they contain and from the
syntactic order of those words; the meaning of a sentence is therefore partially
based on its words and its syntactic structure. The composition of the meaning
representation is guided by the syntactic components and relations provided by
grammars such as CFGs.

A meaning representation is generated by first sending the input through a
parser, which yields the syntactic analysis, and then passing this analysis as
input to a semantic analyzer.

In syntax-driven semantic analysis it is assumed that syntactic, lexical and
anaphoric ambiguities are not a problem.

The semantic meanings are attached to the grammar rules and lexical entries from
which the trees are generated in the first place. This is called the
rule-to-rule hypothesis.

The semantic attachments are written in braces after the syntactic rules
themselves.

After the syntactic analysis has been created, every word receives a FOL
predicate and/or term. The semantic analyzer then walks up the tree until the
complete FOL term has been created. Along the way, lambda reduction is used to
replace predicates and terms with their proper meanings, received from other
parts of the tree.},
booktitle = {Speech and Language Processing},
owner = {jim},
quality = {1},
timestamp = {2013.11.16}
}
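
% A minimal sketch (mine, not the book's) of the rule-to-rule idea summarized in the
% abstract above: lexical entries and grammar rules carry semantic attachments, and the
% analyzer composes them bottom-up by function application (lambda reduction). The
% lexicon and rule below are hypothetical toys; strip the leading "% " to run the Python.
%
% # Toy lexicon with semantic attachments: a term and a one-place predicate.
% lexicon = {
%     "Maharani": lambda: "Maharani",
%     "closed": lambda subj: "Closed(" + subj + ")",
% }
%
% # Rule-to-rule attachment for S -> NP VP: apply the VP semantics to the NP semantics.
% def s_rule(np_sem, vp_sem):
%     return vp_sem(np_sem())
%
% # "Maharani closed" composes to the FOL-style formula Closed(Maharani).
% print(s_rule(lexicon["Maharani"], lexicon["closed"]))  # -> Closed(Maharani)
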
@INBOOK{Jurafsky2009a,
chapter = {17},
pages = {579--616},
title = {Speech and Language Processing},
publisher = {Pearson},
year = {2009},
author = {Jurafsky, Daniel and Martin, James H.},
series = {Prentice-Hall series in artificial intelligence},
edition = {Second},
abstract = {Lambda notation is used to bind variables dynamically to content that
appears later.

lambda x P(x) applied to y results in P(y) after a lambda reduction, as x has
been bound to y.

lambda P P(x) applied to lambda x Restaurant(x) results in
lambda x Restaurant(x)(x), which reduces to Restaurant(x).},
booktitle = {Speech and Language Processing},
owner = {jim},
quality = {1},
timestamp = {2013.11.16}
}
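
% The two reductions described above, replayed with Python lambdas as a toy sketch of
% beta-reduction (not code from the book); strip the leading "% " to run it.
%
% # (lambda x . P(x))(y) reduces to P(y):
% P = lambda t: "P(" + t + ")"
% print((lambda x: P(x))("y"))  # -> P(y)
%
% # (lambda Q . Q(x))(lambda x . Restaurant(x)) reduces to Restaurant(x):
% print((lambda Q: Q("x"))(lambda x: "Restaurant(" + x + ")"))  # -> Restaurant(x)
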
@INBOOK{Jurafsky2009b,
chapter = {13},
pages = {461--492},
title = {Speech and Language Processing},
publisher = {Pearson},
year = {2009},
author = {Jurafsky, Daniel and Martin, James H.},
series = {Prentice-Hall series in artificial intelligence},
edition = {Second},
owner = {jim},
quality = {1},
timestamp = {2013.11.17}
}

@CONFERENCE{Kessler1997,
author = {Kessler, Brett and Nunberg, Geoffrey and Schuetze, Hinrich},
title = {Automatic Detection of Text Genre},
booktitle = {Proceedings of the 35th Annual Meeting of the Association for Computational Linguistics},
year = {1997},
pages = {32--38},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}

@CONFERENCE{Klein2003,
author = {Klein, Dan and Smarr, Joseph and Nguyen, Huy and Manning, Christopher D.},
title = {Named Entity Recognition with Character-Level Models},
booktitle = {Conference on Natural Language Learning (CoNLL)},
year = {2003},
pages = {180--183},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}

@TECHREPORT{Paskin2001,
author = {Paskin, Mark A.},
title = {Cubic-time Parsing and Learning Algorithms for Grammatical Bigram Models},
institution = {University of California},
year = {2001},
number = {UCB/CSD-01-1148},
month = {June},
abstract = {In Dependency Grammar there are head words and dependents. Each phrase
has exactly one head word. The head word determines how all of its dependents
may be syntactically combined with other words to form a sentence. A head word
and all of its dependents form a constituent. In every sentence there may be
one or more dependency relationships, with one head word each.

Dependents that precede their head are called predependents and dependents that
follow their head are called postdependents.

A dependency parse consists of a set of dependency relationships that satisfies
three constraints: 1. Every word except one (the root) is a dependent of exactly
one head. 2. The dependency relationships are acyclic; no word is, through a
sequence of dependency relationships, dependent on itself. 3. When drawn as a
graph above the sentence, no two dependency relations cross - a property known
as projectivity or planarity.

The Grammatical Bigram Probability Model assumes that all the dependents of a
head word are independent of one another and of their relative order. This is a
strong approximation, as in full English there are argument structure
constraints that rely on the order of dependents. This simplification allows
for a reduced computational complexity of parsing and learning. The grammar
model falls into the class of "bilexical grammars".

A dependency parse consists of multiple spans. A span contains at least two and
at most n words. Spans have one defining property: no word in the span has a
parent outside the span. Spans can be joined and closed. To join two spans, one
of them has to be connected (both endwords are connected by an edge) and both
spans have to share an endword. The new span is connected if both subspans were
connected. If that is not the case, it can be closed by adding an edge between
the endwords of the new span.

Every dependency parse has a unique span decomposition. For joining, the left
subspan has to be simple, meaning it either has an edge between its endwords or
consists of only two words. Relying on this ensures that each span is derived
only once.

Every span has a signature. The signature states the indexes of the span's
endwords, whether it is simple, and whether the left or right endword has its
parent within the span. Signatures where both the left and the right endword
have their parent within the span are called top-level signatures, as such
signatures characterize valid parses.

Parser operations take signatures as input rather than spans, and they produce
signatures as well. SEED creates an unconnected, simple span of two adjacent
words. CLOSE-LEFT adds an edge between the endwords and makes the left endword
the parent of the right one. CLOSE-RIGHT does the opposite and makes the right
endword the parent of the left one. These operators require that neither the
left nor the right endword has a parent within the span.

JOIN takes two input spans and joins them. It requires that the spans share an
endword (1.), that the shared endword has one parent (2.), and that the left
input is simple (3.). The JOIN rule applies only if the left span does not
start the sentence.

These operators constitute an algebra over span signatures called the span
signature algebra. A derivation D is an expression in this algebra. Like the
operations, it evaluates to a span signature. These expressions can be
represented as trees whose nodes are operations. There is an isomorphism
between dependency parses and their corresponding derivations.

An optimal derivation must consist of an operation over the results of optimal
sub-derivations. Therefore it is enough to record, for a given signature, the
parse operation with the most likely derivation in order to reconstruct the
most likely derivation of the entire sentence.

The chart-parse algorithm returns the optimal parse. It uses a subprocedure
called EXTRACT-OPT-PARSE that constructs the optimal parse by finding the
top-level signature sigma with maximum optimal probability pi*. It then
backtracks recursively through the optimal derivation defined by omega*.
Whenever CLOSE operations are encountered, edges are recorded in the parse.
The algorithm requires O(n^3) time and O(n^2) space.},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}
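
% A small checker (my own sketch, not Paskin's code) for the three constraints on a
% dependency parse listed in the abstract above: every word except the root has exactly
% one head, the head relation is acyclic, and no two dependency edges cross
% (projectivity). Strip the leading "% " to run the Python.
%
% def is_valid_parse(heads):
%     """heads[i] is the index of word i's head, or None for the single root."""
%     n = len(heads)
%     # 1. exactly one root; every other word has exactly one head (by construction)
%     if sum(h is None for h in heads) != 1:
%         return False
%     # 2. acyclicity: following head links from any word must terminate at the root
%     for i in range(n):
%         seen, j = set(), i
%         while heads[j] is not None:
%             if j in seen:
%                 return False
%             seen.add(j)
%             j = heads[j]
%     # 3. projectivity: no two edges (i, heads[i]) may cross when drawn above the sentence
%     edges = [tuple(sorted((i, h))) for i, h in enumerate(heads) if h is not None]
%     for a, b in edges:
%         for c, d in edges:
%             if a < c < b < d:
%                 return False
%     return True
%
% # "the dog barked": the -> dog, dog -> barked, barked is the root
% print(is_valid_parse([1, 2, None]))  # -> True
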
@INBOOK{Russel2010,
chapter = {23},
pages = {888--927},
title = {Artificial Intelligence: A Modern Approach},
publisher = {Pearson},
year = {2009},
author = {Russell, Stuart J. and Norvig, Peter},
series = {Prentice-Hall series in artificial intelligence},
edition = {Third},
abstract = {The first method for understanding natural language is syntactic analysis,
or parsing. The goal is to find the phrase structure of a sequence of words
according to the rules of the applied grammar.

Strict top-down or bottom-up parsing can be inefficient. Given two sentences
that share their first 10 words and differ only from the 11th word on, parsing
from left to right would force the parser to guess the nature of the sentence,
but it cannot know whether it guessed right until the 11th word. It would then
have to backtrack and reanalyze the sentence.

To prevent this, dynamic programming is used: every analyzed substring gets
stored for later. Once it is discovered that, for example, "the students in
section 2 of Computer Science 101" is a noun phrase, this information can be
stored in a structure known as a chart. Algorithms that do such storing are
called chart parsers. One of these chart parsers is a bottom-up version called
the CYK algorithm, after its inventors John Cocke, Daniel Younger and Tadao
Kasami. This algorithm requires a grammar in Chomsky Normal Form. The algorithm
takes O(n^2 m) space for the P table, with n being the number of words in the
sentence and m the number of nonterminal symbols in the grammar. It takes
O(n^3 m) time, where m is constant for a particular grammar, which is why it is
commonly described as O(n^3). There is no faster algorithm for general
context-free grammars.

The CYK algorithm only computes the probability of the most probable tree. The
subtrees are all represented in the P table.

PCFGs (probabilistic context-free grammars) have many rules, with a probability
for each of them. Learning the grammar from data is better than a knowledge
engineering approach. Learning is easiest if we are given a corpus of correctly
parsed sentences, commonly known as a treebank. The best known treebank is the
Penn Treebank; it consists of 3 million words which have been annotated with
part of speech and parse-tree structure. Given such a collection of trees, a
PCFG can be created just by counting and smoothing.

If no treebank is given it is still possible to learn the grammar, but it is
more difficult. In that case there are actually two problems: first, learning
the structure of the grammar rules, and second, learning the probabilities
associated with them.

PCFGs have the problem that they are context-free. Combining a PCFG and a
Markov model gets the best of both; this leads ultimately to lexicalized PCFGs.
Another problem of PCFGs is their preference for short sentences.

Lexicalized PCFGs introduce so-called head words. Such words are the most
important words in a phrase, and the probabilities are calculated between the
head words. Example: in "eat a banana", "eat" is the head of the verb phrase
"eat a banana", whereas "banana" is the head of the noun phrase "a banana".
The probability P1 now depends on "eat" and "banana", and the result would be
very high. If the head of the noun phrase were "bandanna", the result would be
significantly lower.

The next step is definite clause grammars. They make it possible to parse by
logical inference and to reason about languages and strings in many different
ways. Furthermore, augmentations allow for distinctions within a single
subphrase. For example, the noun phrase (NP) depends on case, person and
number. A real-world example is "to smell": it is "I smell", "you smell",
"we smell" and "they smell", but "he/she/it smells". Which form is used depends
on the person.

Semantic interpretation is used to give sentences a meaning. This is achieved
through logical sentences. The semantics can be added to an already augmented
grammar (created during the previous step), resulting in multiple augmentations
at the same time. CHILL is an inductive logic programming system that can learn
to achieve 70% to 85% accuracy on various database query tasks.

But there are several complications, as English is endlessly complex. First,
there is the time at which things happened (present, past, future). Second,
there is the so-called speech act, the speaker's action that has to be
deciphered by the hearer. The hearer has to find out what type of action it is
(a statement, a question, an order, a warning, a promise and so on). Then there
are so-called long-distance dependencies, and there is ambiguity. Ambiguity
ranges from lexical ambiguity, where a word has multiple usages, over syntactic
ambiguity, where a sentence has multiple parses, to semantic ambiguity, where
one and the same sentence can have different meanings. Finally there is
ambiguity between literal and figurative meanings.

Finally, there are four models that need to be combined to do disambiguation
properly: the world model, the mental model, the language model and the
acoustic model.

-- not so much an abstract of the specific content of that section as an
abstract about speech recognition in general --

The second method is speech recognition. It has the added difficulty that the
words are not clearly separated and that every speaker can pronounce the same
sentence with the same meaning differently. An example is "The train is
approaching"; another written form would be "The train's approaching". Both
convey the same meaning in the written language, but if a BBC, a CNN and a
German news anchor speak this sentence, it will sound dramatically different.
Speech recognition has to deal with that problem to get the written text
associated with the spoken words. From the text, the first method can then be
used to analyze the words and find a meaning. Finally, this meaning can be used
to trigger some kind of action in a dialog system.

--

Some problems of speech recognition are segmentation, coarticulation and
homophones. Two models used are the acoustic model and the language model.
Another major model is the noisy channel model, due to Claude Shannon (1948).
He showed that the original message can always be recovered in a noisy channel
if the original message is encoded in a redundant enough way.

The acoustic model in particular is used to get at the really interesting
parts. It is not interesting how words were spoken but rather what words were
spoken. That means that not all available information needs to be stored, and a
relatively low sample rate is enough: 80 samples per frame at 8 kHz, with a
frame length of about 10 milliseconds. To distinguish words, so-called phones
are used; there are 49 phones used in English. A phoneme is the smallest unit
of sound that has a distinct meaning to speakers of a particular language. Back
to the frames: every frame is summarized by a vector of features. Features are
important aspects of a speech signal. It can be compared to listening to an
orchestra and saying "here the French horns are playing loudly and the violins
are playing softly". Yet another difficulty is dialect variation.

The language model should be learned from a corpus of transcripts of spoken
language. But this is more difficult than building an n-gram model of text,
because it requires a hidden Markov model.

All in all, speech recognition is most effective when used for a specific task
against a restricted set of options. A general-purpose system can only work
accurately if it creates one model for every speaker. Prominent examples like
Apple's Siri are therefore not very accurate.},
bookauthor = {Russell, Stuart J. and Norvig, Peter},
booktitle = {Artificial Intelligence: A Modern Approach},
date = {December 11},
owner = {jim},
timestamp = {2013.10.24}
}
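
% A minimal probabilistic CYK sketch (my own illustration, not the book's pseudocode)
% for a PCFG in Chomsky Normal Form. The P table has one cell per substring and
% nonterminal (O(n^2 m) space), and the triple loop over spans and split points gives
% the O(n^3 m) time mentioned above. The toy grammar is hypothetical; strip the
% leading "% " to run the Python.
%
% from collections import defaultdict
%
% unary = {("NP", "people"): 0.5, ("NP", "fish"): 0.5, ("V", "fish"): 0.5}
% binary = {("S", "NP", "VP"): 1.0, ("VP", "V", "NP"): 1.0}
%
% def cyk(words):
%     n = len(words)
%     # P[i][j][A] = probability of the best parse of words[i:j] rooted in nonterminal A
%     P = [[defaultdict(float) for _ in range(n + 1)] for _ in range(n + 1)]
%     for i, w in enumerate(words):
%         for (A, word), p in unary.items():
%             if word == w:
%                 P[i][i + 1][A] = max(P[i][i + 1][A], p)
%     for span in range(2, n + 1):
%         for i in range(n - span + 1):
%             j = i + span
%             for k in range(i + 1, j):          # split point
%                 for (A, B, C), p in binary.items():
%                     cand = p * P[i][k][B] * P[k][j][C]
%                     P[i][j][A] = max(P[i][j][A], cand)
%     return P[0][n]["S"]
%
% print(cyk("people fish fish".split()))  # probability of the most probable S parse (0.125)
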
@INPROCEEDINGS{Sleator1993,
author = {Sleator, Daniel D. K. and Temperley, Davy},
title = {Parsing English with a Link Grammar},
booktitle = {Third International Workshop on Parsing Technologies},
year = {1993},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}

@CONFERENCE{Smith2008,
author = {Smith, David A. and Eisner, Jason},
title = {Dependency Parsing by Belief Propagation},
booktitle = {Conference on Empirical Methods in Natural Language Processing},
year = {2008},
pages = {145--156},
date = {October 25--27},
owner = {jim},
quality = {1},
timestamp = {2013.10.29}
}